===== CLBlast-1.6.3/.appveyor.yml =====

environment:
  global:
    CLBLAST_BUILD: "C:\\clblast\\build"
    OPENCL_ROOT: "C:\\dependencies\\opencl"

platform:
  - x64

configuration:
  - Release

init:
  - cmake --version
  - C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM%

# Creates an OpenCL library to link against. Taken from clMathLibraries/clBLAS
install:
  - ps: mkdir $env:OPENCL_ROOT
  - ps: pushd $env:OPENCL_ROOT
  # This downloads the source to the Khronos ICD library
  - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git
  - ps: pushd OpenCL-ICD-Loader
  - git checkout cb4acb9 # older version (pre 2.2 support)
  - ps: popd
  - ps: mv ./OpenCL-ICD-Loader/* .
  # This downloads all the opencl header files
  # The cmake build files expect a directory called inc
  - ps: mkdir inc/CL
  - git clone https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL
  - ps: wget https://raw.githubusercontent.com/KhronosGroup/OpenCL-CLHPP/main/include/CL/opencl.hpp -OutFile inc/CL/cl.hpp
  # Switch to OpenCL 2.1 headers
  - ps: pushd inc/CL
  - git checkout bf0f43b # OpenCL 2.1
  - ps: popd
  # - ps: dir; if( $lastexitcode -eq 0 ){ dir include/CL } else { Write-Output boom }
  # Create the static import lib in a directory called lib, so findopencl() will find it
  - ps: mkdir lib
  - ps: pushd lib
  - cmake -G "NMake Makefiles" ..
  - nmake
  - ps: popd
  # Rename the inc directory to include, so FindOpencl() will find it
  - ps: ren inc include
  - ps: popd

before_build:
  - ps: mkdir $env:CLBLAST_BUILD
  - ps: pushd $env:CLBLAST_BUILD
  - ps: mkdir install_dir
  - cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DTESTS=ON -DCLIENTS=ON -DSAMPLES=ON -DNETLIB=ON %APPVEYOR_BUILD_FOLDER%

build_script:
  - nmake
  - nmake install

after_build:
  - ps: pushd $env:CLBLAST_BUILD
  - 7z a CLBlast-1.6.3-Windows-x64.zip .\install_dir\*
  - ps: mv CLBlast-1.6.3-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER

artifacts:
  - path: '*.zip'
    name: release
    type: zip

===== CLBlast-1.6.3/.github/FUNDING.yml =====

github: CNugteren

===== CLBlast-1.6.3/.github/workflows/build_and_test.yml =====

name: CLBlast build

on:
  pull_request: {}
  push:
    branches: ['master']

jobs:
  build_and_test_linux_and_macos:
    strategy:
      matrix:
        config: [
          {os: ubuntu-latest, c_compiler: gcc, cpp_compiler: g++},
          {os: ubuntu-latest, c_compiler: clang, cpp_compiler: clang++},
          {os: macos-13, c_compiler: clang, cpp_compiler: clang++},
        ]
    runs-on: ${{ matrix.config.os }}
    steps:
      - uses: actions/checkout@v3

      - name: Install requirements for Ubuntu
        run: |
          sudo apt-get update
          sudo apt-get install -yq ninja-build ocl-icd-opencl-dev opencl-c-headers libopenblas-dev --no-install-recommends
        if: ${{ matrix.config.os == 'ubuntu-latest' }}

      - name: Install requirements for macOS
        run: brew install ninja
        if: ${{ matrix.config.os == 'macos-13' }}

      - name: Run CMake
        run: |
          export CC=${{ matrix.config.c_compiler }}
          export CXX=${{ matrix.config.cpp_compiler }}
          cmake -S . -B build -G Ninja -DTESTS=ON -DCLIENTS=ON -DSAMPLES=ON

      - name: Compile the code
        run: cmake --build build

      - name: Get the diagnostics info
        run: ./build/clblast_test_diagnostics
        if: ${{ matrix.config.os == 'macos-13' }}

      - name: Run an example client
        run: ./build/clblast_client_xgemm
        if: ${{ matrix.config.os == 'macos-13' }}

      - name: Run an example sample program
        run: ./build/clblast_sample_dgemv_c
        if: ${{ matrix.config.os == 'macos-13' }}

      - name: Run an example tuner
        run: ./build/clblast_tuner_xdot
        if: ${{ matrix.config.os == 'macos-13' }}

      - name: Run the unittests
        run: ctest --test-dir build
        if: ${{ matrix.config.os == 'macos-13' }}

  build_windows:
    strategy:
      matrix:
        config: [
          {os: windows-2019, arch: x64},
        ]
    runs-on: ${{ matrix.config.os }}
    steps:
      - uses: actions/checkout@v3

      - name: Set up MSVC
        uses: ilammy/msvc-dev-cmd@v1

      - name: Install OpenBLAS
        run: |
          mkdir openblas
          cd openblas
          C:\msys64\usr\bin\wget.exe https://github.com/xianyi/OpenBLAS/releases/download/v0.3.23/OpenBLAS-0.3.23-x64.zip
          7z x OpenBLAS-0.3.23-x64.zip
          pwd
          ls

      - name: Install OpenCL
        run: vcpkg.exe --triplet=${{ matrix.config.arch }}-windows install opencl

      - name: Run CMake
        run: cmake -S . -B build -DTESTS=ON -DCLIENTS=ON -DSAMPLES=ON -DOPENCL_ROOT=C:\vcpkg\packages\opencl_x64-windows -DCBLAS_ROOT=${{ github.workspace }}\openblas

      - name: Compile the code
        run: cmake --build build

===== CLBlast-1.6.3/.github/workflows/release.yml =====

name: CLBlast release

on:
  workflow_dispatch:
    inputs:
      version:
        description: "Version of the form 1.5.3"
        required: true

jobs:
  release_linux_and_macos:
    strategy:
      matrix:
        config: [
          {name: linux, os: ubuntu-20.04, arch: x86_64, c_compiler: gcc-9, cpp_compiler: g++-9},
          {name: macos, os: macos-11, arch: x86_64, c_compiler: clang, cpp_compiler: clang++},
        ]
    runs-on: ${{ matrix.config.os }}
    env:
      RELEASE_NAME: CLBlast-${{ github.event.inputs.version }}-${{ matrix.config.name }}-${{ matrix.config.arch }}
    steps:
      - uses: actions/checkout@v3

      - name: Install requirements for Ubuntu
        run: |
          sudo apt-get update
          sudo apt-get install -yq ocl-icd-opencl-dev opencl-c-headers --no-install-recommends
        if: ${{ matrix.config.name == 'linux' }}

      - name: Run CMake
        run: |
          mkdir ${{env.RELEASE_NAME}}
          export CC=${{ matrix.config.c_compiler }}
          export CXX=${{ matrix.config.cpp_compiler }}
          cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DTESTS=OFF -DCLIENTS=OFF -DSAMPLES=ON -DCMAKE_INSTALL_PREFIX=${{env.RELEASE_NAME}}

      - name: Compile the code
        run: cmake --build build

      - name: Package the code
        run: |
          cmake --build build --target install
          tar -cvzf ${{env.RELEASE_NAME}}.tar.gz ${{env.RELEASE_NAME}}

      - name: Upload the release
        uses: actions/upload-artifact@v3
        with:
          name: ${{env.RELEASE_NAME}}
          path: ${{env.RELEASE_NAME}}.tar.gz

  release_windows:
    strategy:
      matrix:
        config: [
          {name: windows, os: windows-2019, arch: x64},
        ]
    runs-on: ${{ matrix.config.os }}
    env:
      RELEASE_NAME: CLBlast-${{ github.event.inputs.version }}-${{ matrix.config.name }}-${{ matrix.config.arch }}
    steps:
      - uses: actions/checkout@v3

      - name: Set up MSVC
        uses: ilammy/msvc-dev-cmd@v1

      - name: Install OpenCL
        run: vcpkg.exe --triplet=${{ matrix.config.arch }}-windows install opencl

      - name: Run CMake
        run: |
          mkdir "${{env.RELEASE_NAME}}"
          cmake -S . -B build -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DTESTS=OFF -DCLIENTS=OFF -DSAMPLES=ON -DCMAKE_INSTALL_PREFIX="${{env.RELEASE_NAME}}" -DOPENCL_ROOT=C:/vcpkg/packages/opencl_x64-windows

      - name: Compile the code
        run: cmake --build build --config Release

      - name: Package the code
        run: |
          cmake --build build --target install
          7z a -r ${{env.RELEASE_NAME}}.7z "${{env.RELEASE_NAME}}"

      - name: Upload the release
        uses: actions/upload-artifact@v3
        with:
          name: ${{env.RELEASE_NAME}}
          path: ${{env.RELEASE_NAME}}.7z

===== CLBlast-1.6.3/.gitignore =====

build
stash
.*
*.pyc
database.json
database_best.json
cl.hpp
opencl.hpp
src/pyclblast/dist
*.egg-info
===== CLBlast-1.6.3/CHANGELOG =====

Development version (next version)
- (no changes yet since last release)

Version 1.6.3
- Fixed a bug in the GEMMK=1 kernel (with 2D register tiling) when MWG!=NWG
- CMake fixes for older versions and for the CUDA backend
- Added tuned parameters for many devices (see doc/tuning.md)

Version 1.6.2
- Fixed a bug in the pre-processor that would cause issues on Arm GPUs
- Fixed the DLL install directory in mingw
- Modifications to the Python bindings (pyclblast):
  * Convert float scalar values to cl_half for fp16 routines
  * Amax/amin, max/min routines accept unsigned integer buffers for index
  * Switch to pyproject.toml file for installing Python bindings
  * Build Python bindings using CMake, adding Windows support
- Generator script now always uses LF endings, independent of the platform
- Added tuned parameters for many devices (see doc/tuning.md)

Version 1.6.1
- Fixed a pointer error in pyclblast on Arm
- Fixed a multithreading bug related to storing objects in the cache
- Added tuned parameters for many devices (see doc/tuning.md)

Version 1.6.0
- Modifications to improve performance on Qualcomm Adreno GPUs:
  * Unique database entries for specific Adreno devices
  * Toggle OpenCL kernel compilation options for Adreno
  * New preprocessor directive RELAX_WORKGROUP_SIZE
- Fixed a bug in handling of #undef in CLBlast loop unrolling and array-to-register mapping functions
- Fixed a bug in XAMAX/XAMIN routines related to inadvertently including the increment and offset in the result
- Fixed a bug in XAMAX/XAMIN routines that would cause only the real part of a complex number to be taken into account
- Fixed a bug that caused tests to not properly do integer-output testing (for XAMAX/XAMIN)
- Fixed a minor issue with the expected input buffer size in the TRMV/TBMV/TPMV/TRSV routines
- Fixed an issue with crashes on Android related to calling clReleaseProgram
- Fixed two small issues in the plotting script
- Fixed a documentation bug in the 'ld' requirements
- Enabled GitHub Actions CI builds for testing and releasing
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see doc/tuning.md)

Version 1.5.3
- Fixed a correctness issue with DGEMM on SM 7.5 Turing GPUs
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see doc/tuning.md)
- Updated cl.hpp to the new opencl.hpp header in the samples
- Changed the complex sum routine to return the complex sum instead of the absolute complex sum

Version 1.5.2
- Changed XAMAX/XAMIN to more likely return the first rather than the last min/max index, updated API docs
- Added batched routines to pyclblast
- Added CLBLAST_VERSION_MAJOR/MINOR/PATCH defines in headers to store version numbering
- Several small improvements to the benchmark script (thanks to 'baryluk')
- Fixed a bug in the caching when using a context with multiple devices
- Fixed a bug in the tuners related to global workgroup size not being a multiple of the local
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see doc/tuning.md)

Version 1.5.1
- Implemented single-kernel version of convolution as GEMM
- Now catches all exceptions thrown by the tuners
- Fixed a bug in the ISAMIN kernel
- Fixed an out-of-bounds read/write in the XHAD routine (thanks to etomzak)
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see doc/tuning.md)

Version 1.5.0
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
- Added an option to compile the Netlib API with static OpenCL device and context (-DNETLIB_PERSISTENT_OPENCL=ON)
- Added a FAQ page to the documentation
- The tuners now check beforehand on invalid local thread sizes and skip those completely
- Made the tuning API (OverrideParameters) more flexible, disregarding superfluous parameters
- Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
- Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
- Fixed an issue for unequal MWG and NWG and the new GEMMK == 1 kernel
- Fixed an issue for certain parameters for AXPY's 'XaxpyFaster' kernel
- Various minor fixes and enhancements
- Added non-BLAS routines:
  * SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)
  * SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM (col2im transform as used in machine learning)

Version 1.4.1
- Fixed an access violation under Windows upon releasing the OpenCL program when the driver is already unloaded
- Fixed an issue with double cl_program release in the CLBlast caching system
- Added tuned parameters for various devices (see doc/tuning.md)

Version 1.4.0
- Added Python interface to CLBlast 'PyCLBlast'
- Added CLBlast to Ubuntu PPA and macOS Homebrew package managers
- Added an API to run the tuners programmatically without any I/O
- Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling
- Added support for Intel specific subgroup shuffling extensions for faster GEMM on Intel GPUs
- Re-added a local memory size constraint to the tuners
- The routine tuners now automatically pick up tuning results from disk from the kernel tuners
- Updated and reorganised the CLBlast documentation
- Added a 'canary' region to check for overflows in the tuner and tests (inspired by clARMOR)
- Added an option to test against and compare performance with Intel's MKL
- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see doc/tuning.md)
- Added non-BLAS level-1 routines:
  * SHAD/DHAD/CHAD/ZHAD/HHAD (Hadamard element-wise vector-vector product)

Version 1.3.0
- Re-designed and integrated the auto-tuner, no more dependency on CLTune
- Made it possible to override the tuning parameters in the clients straight from JSON tuning files
- Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers which don't do this themselves (ARM Mali) - greatly improves performance on these platforms
- Added first tuners for the TRSV (block size) and TRSM (invert kernel) routines
- Added an optional argument to the GEMM routine to provide a pre-allocated temporary buffer
- Fixed an issue with a crashing/hanging AMD APP compiler with the TRSM routine (invert kernel)
- Improved compilation time by splitting the tuning database into multiple compilation units
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
- Added the RetrieveParameters function to the API to be able to inspect the tuning parameters
- Added a strided-batched (not part of the BLAS standard) routine, faster but less generic compared to the existing xGEMMBATCHED routines:
  * SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED

Version 1.2.0
- Fixed a bug in the TRSM/TRSV routines due to missing synchronisations after GEMM/GEMV calls
- Fixed a bug in TRSM when using the a-offset argument
- Added a CUDA API to CLBlast:
  * The library and kernels can be compiled with the CUDA driver API and NVRTC (requires CUDA 7.5)
  * Two CUDA API sample programs are added: SGEMM and DAXPY
  * All correctness tests and performance clients work on CUDA like they did for OpenCL
- Kernels are now cached based on their tuning parameters: fits the use-case of 'OverrideParameters'
- Cross-compiling for Android is now supported using CMake; instructions are added to the README
- Improved performance for small GEMM problems by going from 3 to 1 optional temporary buffers
- GEMM kernel selection (direct vs in-direct) is now done automatically using a new tuner
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)

Version 1.1.0
- The tuning database now has defaults per architecture (e.g. NVIDIA Kepler SM3.5, AMD Fiji)
- The tuning database now has a dictionary to translate vendor/device names to a common set
- The tuners can now distinguish between different AMD GPU board names of the same architecture
- The tuners can now use particle-swarm optimisation to search more efficiently (thanks to 'mcian')
- Improved performance for small problems on NVIDIA hardware by caching the device name
- Further improved compilation time of database.cpp
- Added a small diagnostics helper executable
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
- Added non-BLAS routines:
  * SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL (im2col transform as used to express convolution as GEMM)

Version 1.0.1
- Fixed a bug in the direct version of the GEMM kernel

Version 1.0.0
- Fixed a bug in the TRSM routine for alpha != 1
- Fixed a bug in the cache related to multi-device contexts (thanks to 'kpot')
- Fixed a bug in the direct version of the GEMM kernel
- Fixed several warnings for MSVC and Clang
- Added support for Mesa Clover and AMD's ROCm by making the inline keyword optional in kernels
- Performance reports are now external at https://cnugteren.github.io/clblast
- Greatly improved compilation time of database.cpp
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
- Added non-BLAS level-1 routines:
  * iSAMIN/iDAMIN/iCAMIN/iZAMIN (absolute minimum version of the ixAMAX BLAS routines)

Version 0.11.0
- Improved the internal program source and binary caches for scalability and speed (thanks to 'intelfx')
- Fixed a bug having to re-create the binary even if it was in the cache
- Fixed a bug when using offsets in the direct version of the GEMM kernels
- Fixed a missing cl_khr_fp64 when running double-precision on Intel CPUs
- Fixed tests on Apple's CPU OpenCL implementation; still not fast but correct at least
- Fixed bugs in the half-precision routines HTBMV/HTPMV/HTRMV/HSYR2K/HTRMM
- Tests now also exit with an error code when OpenCL errors or compilation errors occur
- Tests now also check for the L2 error in case of half-precision
- Clients can now test against cuBLAS on NVIDIA systems for performance comparisons (-DCUBLAS=ON)
- Replaced the R graph scripts with Python/Matplotlib scripts
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
- Added the OverrideParameters function to the API to be able to supply custom tuning parameters
- Added triangular solver (level-2 & level-3) routines:
  * STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized)
  * STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized)
- Added batched (not part of the BLAS standard) routines:
  * SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED (batched version of AXPY)
  * SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED (batched version of GEMM)

Version 0.10.0
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code
- Added a Netlib CBLAS compatible API (not recommended for full control over performance)
- Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
- Fixed a bug in the tests and samples related to waiting for an invalid event
- Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters
- Fixed a bug in the TRMM routine that would overwrite input data before consuming everything
- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
- Added an option to run tuned kernels multiple times to average execution times
- Added an option to build a static version of the library
- Made it possible to use the command-line environmental vars everywhere and without re-running CMake
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)

Version 0.9.0
- Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
- Improved performance significantly of rotated GEMV computations
- Improved performance of unseen/un-tuned devices by a better default tuning parameter selection
- Fixed proper MSVC dllimport and dllexport declarations
- Fixed memory leaks related to events not being released
- Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems
- Fixed a bug related to the cache and retrieval of programs based on the OpenCL context
- Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels
- Fixed a bug in the OpenCL kernels: now placing __kernel before __attribute__
- Fixed a bug in level-3 routines when beta is zero and matrix C contains NaNs
- Added an option (-warm_up) to do a warm-up run before timing in the performance clients
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)

Version 0.8.0
- Added support for half-precision floating-point (fp16) in the library
- Made it possible to compile the performance tests (clients) separately from the correctness tests
- Made a reference BLAS and head-to-head performance comparison optional in the clients
- Increased the verbosity of the "-verbose" option in the correctness tests
- Refactored the host code for better compilation times and fewer lines of code
- Added Appveyor continuous integration and increased coverage of the Travis builds
- Improved the API documentation
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
- Added half-precision routines:
  * Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN
  * Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2
  * Level-3: HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM
- Added non-BLAS routines:
  * SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY (matrix copy, scaling, and/or transpose)

Version 0.7.1
- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
- Fixed a bug in the xGEMM routine related to the event incorrectly set
- Made MSVC link the run-time libraries statically

Version 0.7.0
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
- Made the library thread-safe
- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries
- Fixed the use of events within the library
- Changed the enum parameters to match the raw values of the cblas standard
- Fixed the cache of previously compiled binaries and added a function to fill or clear it
- Various minor fixes and enhancements
- Added a preliminary version of the API documentation
- Added additional sample programs
- Added tuned parameters for various devices (see README)
- Added level-1 routines:
  * SNRM2/DNRM2/ScNRM2/DzNRM2
  * SASUM/DASUM/ScASUM/DzASUM
  * SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines)
  * iSAMAX/iDAMAX/iCAMAX/iZAMAX
  * iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines)
  * iSMIN/iDMIN/iCMIN/iZMIN (non-absolute minimum version of the above ixAMAX BLAS routines)

Version 0.6.0
- Added support for MSVC (Visual Studio) 2015
- Added tuned parameters for various devices (see README)
- Now automatically generates C++ code from JSON tuning results
- Added level-2 routines:
  * SGER/DGER
  * CGERU/ZGERU
  * CGERC/ZGERC
  * CHER/ZHER
  * CHPR/ZHPR
  * CHER2/ZHER2
  * CHPR2/ZHPR2
  * CSYR/ZSYR
  * CSPR/ZSPR
  * CSYR2/ZSYR2
  * CSPR2/ZSPR2

Version 0.5.0
- Improved structure and performance of level-2 routines (xSYMV/xHEMV)
- Reduced compilation time of level-3 OpenCL kernels
- Added level-1 routines:
  * SSWAP/DSWAP/CSWAP/ZSWAP
  * SSCAL/DSCAL/CSCAL/ZSCAL
  * SCOPY/DCOPY/CCOPY/ZCOPY
  * SDOT/DDOT
  * CDOTU/ZDOTU
  * CDOTC/ZDOTC
- Added level-2 routines:
  * SGBMV/DGBMV/CGBMV/ZGBMV
  * CHBMV/ZHBMV
  * CHPMV/ZHPMV
  * SSBMV/DSBMV
  * SSPMV/DSPMV
  * STRMV/DTRMV/CTRMV/ZTRMV
  * STBMV/DTBMV/CTBMV/ZTBMV
  * STPMV/DTPMV/CTPMV/ZTPMV

Version 0.4.0
- Now using the Claduc C++11 interface to OpenCL
- Added plain C API for increased compatibility (clblast_c.h)
- Re-organized tuner infrastructure and added JSON output
- Removed clBLAS sources, it should now be installed separately for testing
- Added Travis continuous integration
- Added level-2 routines:
  * CHEMV/ZHEMV
  * SSYMV/DSYMV

Version 0.3.0
- Re-organized test/client infrastructure to avoid code duplication
- Added an optional bypass for pre/post-processing kernels in level-3 routines
- Significantly improved performance of level-3 routines on AMD GPUs
- Added level-3 routines:
  * CHEMM/ZHEMM
  * SSYRK/DSYRK/CSYRK/ZSYRK
  * CHERK/ZHERK
  * SSYR2K/DSYR2K/CSYR2K/ZSYR2K
  * CHER2K/ZHER2K
  * STRMM/DTRMM/CTRMM/ZTRMM

Version 0.2.0
- Added support for complex conjugate transpose
- Several host-code performance improvements
- Improved testing infrastructure and coverage
- Added level-2 routines:
  * SGEMV/DGEMV/CGEMV/ZGEMV
- Added level-3 routines:
  * CGEMM/ZGEMM
  * CSYMM/ZSYMM

Version 0.1.0
- Initial preview version release to GitHub
- Supported level-1 routines:
  * SAXPY/DAXPY/CAXPY/ZAXPY
- Supported level-3 routines:
  * SGEMM/DGEMM
  * SSYMM/DSYMM
===== CLBlast-1.6.3/CMakeLists.txt =====

# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a
# max-width of 100 characters per line.
#
# Author(s):
#   Cedric Nugteren
#
# ==================================================================================================

cmake_minimum_required(VERSION 2.8.11)

# Overrides for MSVC static runtime
option(OVERRIDE_MSVC_FLAGS_TO_MT "Override compiler flags for MSVC to build with a static runtime (/MT instead of /MD)" ON)
if(OVERRIDE_MSVC_FLAGS_TO_MT)
  set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake)
  set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_flag_overrides.cmake)
endif()

# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 1)
set(clblast_VERSION_MINOR 6)
set(clblast_VERSION_PATCH 3)
set(clblast_VERSION "${clblast_VERSION_MAJOR}.${clblast_VERSION_MINOR}.${clblast_VERSION_PATCH}")
set(clblast_SOVERSION ${clblast_VERSION_MAJOR})

# Policies
IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0")
  cmake_policy(SET CMP0074 NEW)  # to make -DCBLAS_ROOT= work with newer CMake versions as well
ENDIF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0")

# Options and their default values
option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
option(SAMPLES "Enable compilation of the examples" OFF)
option(TUNERS "Enable compilation of the tuners" ON)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)

# The optional Netlib API for CLBlast
option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
option(NETLIB_PERSISTENT_OPENCL "Makes OpenCL device and context in the CBLAS Netlib API static" OFF)
if(NETLIB)
  message("-- Building the Netlib API of CLBlast")
  if(NETLIB_PERSISTENT_OPENCL)
    message("   ^^ while using static variables for OpenCL device and context")
    add_definitions(-DNETLIB_PERSISTENT_OPENCL)
  endif()
endif()

# Workarounds for bugs
option(AMD_SI_EMPTY_KERNEL_WORKAROUND "Enables workaround for bug in AMD Southern Island GPUs" OFF)
if(AMD_SI_EMPTY_KERNEL_WORKAROUND)
  add_definitions(-DAMD_SI_EMPTY_KERNEL_WORKAROUND)
endif()

# Select between an OpenCL API (default) or a CUDA API (beta)
option(OPENCL "Build CLBlast with an OpenCL API (default)" ON)
option(CUDA "Build CLBlast with a CUDA API (beta)" OFF)
if(NOT OPENCL AND NOT CUDA)
  message(FATAL_ERROR "No API selected, choose from OpenCL (-DOPENCL=ON) or CUDA (-DCUDA=ON)")
endif()
if(OPENCL AND CUDA)
  message(FATAL_ERROR "Multiple APIs selected, choose either OpenCL (-DOPENCL=ON -DCUDA=OFF) or CUDA (-DCUDA=ON -DOPENCL=OFF)")
endif()
if(OPENCL)
  message("-- Building CLBlast with OpenCL API (default)")
  add_definitions(-DOPENCL_API)
elseif(CUDA)
  message("-- Building CLBlast with CUDA API (beta)")
  add_definitions(-DCUDA_API)
endif()

# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
if(VERBOSE)
  message("-- Building in verbose mode")
  add_definitions(-DVERBOSE)
endif()

# ==================================================================================================

# RPATH settings
set(CMAKE_MACOSX_RPATH 1)

# ==================================================================================================

# Compiler-version check (requires at least CMake 2.8.10)
if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
    message(FATAL_ERROR "GCC version must be at least 4.7")
  endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang)
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
    message(FATAL_ERROR "Clang version must be at least 3.3")
  endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang)
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
    message(FATAL_ERROR "AppleClang version must be at least 5.0")
  endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel)
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
    message(FATAL_ERROR "ICC version must be at least 14.0")
  endif()
elseif(MSVC)
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
    message(FATAL_ERROR "MS Visual Studio version must be at least 18.0")
  endif()
endif()

# DLL Settings
if(MSVC)
  if(BUILD_SHARED_LIBS)
    add_definitions(" /DCLBLAST_DLL")
  endif()
endif(MSVC)

# C++ compiler settings
if(MSVC)
  set(FLAGS "/Ot")
  set(FLAGS "${FLAGS} /wd4715 /D_CRT_SECURE_NO_WARNINGS")
else()
  set(FLAGS "-std=c++11")
  if(VERBOSE)
    set(FLAGS "${FLAGS} -O1 -g")
  else()
    set(FLAGS "${FLAGS} -O2")
  endif()
  if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
    set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn -Wno-unused-function")
    if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
      set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
    endif()
    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0.0)
      # GCC does not support attributes on template arguments
      # in particular we hit this with the alignment attributes on cl_XXX types
      # which are then used to instantiate various templates in CLBlast
      set(FLAGS "${FLAGS} -Wno-ignored-attributes")
    endif()
  elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang)
    set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
    set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
    set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
    set(FLAGS "${FLAGS} -Wno-deprecated-declarations -Wno-unused-function")
    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 3.9.0)  # clang 4.0 or higher
      if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0)  # but not for AppleClang
        set(FLAGS "${FLAGS} -Wno-undefined-var-template")
      endif()
    endif()
  endif()
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")

# C compiler settings (for the sample)
if(MSVC)
  set(CFLAGS "/Ot")
else()
  set(CFLAGS "-O2 -std=c99")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")

# ==================================================================================================

# Package scripts location
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/")

if(OPENCL)
  # Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
  find_package(OpenCL REQUIRED)
  set(API_LIBRARIES ${OPENCL_LIBRARIES})
  set(API_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS})
elseif(CUDA)
  # For CUDA, the "FindCUDA.cmake" is part of CMake
  find_package(CUDA REQUIRED)
  set(API_LIBRARIES cuda nvrtc)
  set(API_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
  link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
endif()

# Don't search for system libraries when cross-compiling
if(${CMAKE_SYSTEM_NAME} STREQUAL Android)
  if(TESTS)
    message(STATUS "Compilation of the tests disabled for the Android target")
    set(TESTS OFF)
  endif()
  if(CLIENTS)
    message(STATUS "Head-to-head performance comparison not supported in the clients for the Android target")
  endif()
else()
  # Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake",
  # "FindCBLAS.cmake", "FindMKL.cmake", and "FindcuBLAS.cmake" are included.
  if(CLIENTS OR TESTS)
    find_package(CBLAS)
    find_package(MKL)
    if(OPENCL)
      find_package(clBLAS)
    endif()
    if(CUBLAS)
      find_package(cuBLAS)
    endif()
    if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND AND NOT MKL_FOUND)
      if(TESTS)
        message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
        set(TESTS OFF)
      endif()
      if(CLIENTS)
        message(STATUS "Could NOT find clBLAS nor a CPU BLAS, head-to-head performance comparison not supported in the clients")
      endif()
    endif()
  endif()
endif()

# ==================================================================================================

# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
    xgemm xgemm_direct xgemv invert xconvgemm)
set(DATABASES copy pad padtranspose transpose xaxpy xdot
    xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger
    invert gemm_routine trsv_routine xconvgemm)
set(ROUTINE_TUNERS xgemm xtrsv)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
    xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
set(LEVELX_ROUTINES xhad xomatcopy xim2col xcol2im xconvgemm
    xaxpybatched xgemmbatched xgemmstridedbatched)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
set(PRECISIONS 32 64 3232 6464 16)

# Sample programs
if(OPENCL)
  set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm tuning_api)
  set(SAMPLE_PROGRAMS_C sasum samax dgemv sgemm haxpy cache)
  if(NETLIB)
    set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
  endif()
elseif(CUDA)
  set(SAMPLE_PROGRAMS_CPP daxpy_cuda sgemm_cuda)
  set(SAMPLE_PROGRAMS_C )
endif()

# ==================================================================================================

# Gathers all source-files (required for the compiler) and header-files (for IDEs only)
set(SOURCES
  src/database/database.cpp
  src/routines/common.cpp
  src/utilities/compile.cpp
  src/utilities/clblast_exceptions.cpp
  src/utilities/timing.cpp
  src/utilities/utilities.cpp
  src/api_common.cpp
  src/cache.cpp
  src/kernel_preprocessor.cpp
  src/routine.cpp
  src/routines/levelx/xinvert.cpp  # only source, don't include it as a test
  src/tuning/configurations.cpp
)
set(HEADERS  # such that they can be discovered by IDEs such as CLion and Visual Studio
  include/clblast_half.h
  src/database/apple_cpu_fallback.hpp
  src/database/database.hpp
  src/database/database_structure.hpp
  src/routines/level1/xamin.hpp
  src/routines/level1/xmax.hpp
  src/routines/level1/xmin.hpp
  src/routines/level1/xsum.hpp
  src/routines/common.hpp
  src/routines/routines.hpp
  src/utilities/buffer_test.hpp
  src/utilities/compile.hpp
  src/utilities/clblast_exceptions.hpp
  src/utilities/device_mapping.hpp
  src/utilities/msvc.hpp
  src/utilities/timing.hpp
  src/utilities/utilities.hpp
  src/cache.hpp
  src/kernel_preprocessor.hpp
  src/cxpp11_common.hpp
  src/routine.hpp
  src/tuning/configurations.hpp
  src/tuning/tuning.hpp
  src/tuning/routines/routine_tuner.hpp
)
if(OPENCL)
  set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp src/tuning/tuning_api.cpp)
  set(HEADERS ${HEADERS} include/clblast.h include/clblast_c.h src/clpp11.hpp)
  if(NETLIB)
    set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
    set(HEADERS ${HEADERS} include/clblast_netlib_c.h)
  endif()
elseif(CUDA)
  set(SOURCES ${SOURCES} src/clblast_cuda.cpp)
  set(HEADERS ${HEADERS} include/clblast_cuda.h src/cupp11.hpp)
endif()
foreach(ROUTINE ${LEVEL1_ROUTINES})
  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
  set(HEADERS ${HEADERS} src/routines/level1/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
  set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cpp)
  set(HEADERS ${HEADERS} src/routines/level2/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
  set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cpp)
  set(HEADERS ${HEADERS} src/routines/level3/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVELX_ROUTINES})
  set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cpp)
  set(HEADERS ${HEADERS} src/routines/levelx/${ROUTINE}.hpp)
endforeach()
foreach(DATABASE ${DATABASES})
  set(SOURCES ${SOURCES} src/database/kernels/${DATABASE}/${DATABASE}.cpp)
  set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}.hpp)
  set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_16.hpp)
  set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_32.hpp)
  set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_64.hpp)
  set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_3232.hpp)
  set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_6464.hpp)
endforeach()
foreach(KERNEL ${KERNELS})
  set(HEADERS ${HEADERS} src/tuning/kernels/${KERNEL}.hpp)
endforeach()

# Creates and links the library
if(BUILD_SHARED_LIBS)
  add_library(clblast SHARED ${SOURCES} ${HEADERS})
else(BUILD_SHARED_LIBS)
  add_library(clblast STATIC ${SOURCES} ${HEADERS})
endif()
set_target_properties(clblast PROPERTIES VERSION ${clblast_VERSION} SOVERSION ${clblast_SOVERSION})
target_link_libraries(clblast ${API_LIBRARIES})

# Includes directories: CLBlast and OpenCL
# (the generator expressions below are reconstructed; the archive extraction ate the angle-bracket contents)
target_include_directories(clblast PUBLIC
  $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
  $<INSTALL_INTERFACE:include>
  ${API_INCLUDE_DIRS})

# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built
if(MSVC)
  if(BUILD_SHARED_LIBS)
    target_compile_definitions(clblast PRIVATE COMPILING_DLL=1)  # requires at least CMake 2.8.11
  endif()
endif()

# Installs the library
include(GNUInstallDirs)
install(TARGETS clblast EXPORT CLBlast
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(FILES include/clblast_half.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(OPENCL)
  install(FILES include/clblast.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
  install(FILES include/clblast_c.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
  if(NETLIB)
    install(FILES include/clblast_netlib_c.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
  endif()
elseif(CUDA)
  install(FILES include/clblast_cuda.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()

# Installs the config for find_package in dependent projects
install(EXPORT CLBlast DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/CLBlast FILE CLBlastConfig.cmake)

# Install pkg-config file on Linux
if(UNIX OR MINGW)
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/clblast.pc.in" "${CMAKE_CURRENT_BINARY_DIR}/clblast.pc" @ONLY IMMEDIATE)
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clblast.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
endif()

# ==================================================================================================
# This section contains all the code related to the examples
if(SAMPLES)

  # Downloads the opencl.hpp file from Khronos
  if(OPENCL)
    file(DOWNLOAD https://raw.githubusercontent.com/KhronosGroup/OpenCL-CLHPP/main/include/CL/opencl.hpp ${clblast_SOURCE_DIR}/samples/opencl.hpp)
  endif()

  # Adds sample programs (C++)
  foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
    add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp)
    target_link_libraries(clblast_sample_${SAMPLE} clblast ${API_LIBRARIES})
    install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
  endforeach()

  # Adds sample programs (C)
  foreach(SAMPLE ${SAMPLE_PROGRAMS_C})
    add_executable(clblast_sample_${SAMPLE}_c samples/${SAMPLE}.c)
    target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${API_LIBRARIES})
    install(TARGETS clblast_sample_${SAMPLE}_c DESTINATION bin)
  endforeach()

endif()

# ==================================================================================================
# This section contains all the code related to the tuners
if(TUNERS)

  set(TUNERS_COMMON
    src/utilities/compile.cpp
    src/utilities/clblast_exceptions.cpp
    src/utilities/timing.cpp
    src/utilities/utilities.cpp
    src/tuning/configurations.cpp
    src/tuning/tuning.cpp
    src/kernel_preprocessor.cpp)
  set(TUNERS_HEADERS  # such that they can be discovered by IDEs such as CLion and Visual Studio
    src/utilities/compile.hpp
    src/utilities/clblast_exceptions.hpp
    src/utilities/timing.hpp
    src/utilities/utilities.hpp
    src/tuning/configurations.hpp
    src/tuning/tuning.hpp
    src/tuning/routines/routine_tuner.hpp
    src/kernel_preprocessor.hpp)
  set(TUNERS_COMMON ${TUNERS_COMMON} ${TUNERS_HEADERS})

  # Creates a library with common sources for all tuners
  if(MSVC)
    # Visual Studio requires the sources of non-exported objects/libraries
  else()
    # Creates the common performance-tests objects (requires CMake 2.8.8)
    add_library(tuners_common_library OBJECT ${TUNERS_COMMON})
    # Adds CLBlast's interface include paths because we can't link to CLBlast here
    target_include_directories(tuners_common_library PRIVATE
      $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
      ${clblast_SOURCE_DIR} ${API_INCLUDE_DIRS})
    set(TUNERS_COMMON $<TARGET_OBJECTS:tuners_common_library>)
  endif()

  # Adds tuning executables
  set(ALLKERNELS ${KERNELS})
  foreach(KERNEL ${ALLKERNELS})
    add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
    target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
    target_include_directories(clblast_tuner_${KERNEL} PUBLIC
      $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
      ${API_INCLUDE_DIRS})
    install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
  endforeach()
  if(OPENCL)
    foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
      add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
      target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
      target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC
        $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
        ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
      install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
    endforeach()
  endif()

  # Adds 'alltuners' target: runs all tuners for all precisions
  set(ALLTUNERS )
  set(ALLTUNERSDEPENDS )
  foreach(KERNEL ${KERNELS})
    foreach(PRECISION ${PRECISIONS})
      set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_${KERNEL} -precision ${PRECISION})
    endforeach()
    set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
  endforeach()
  if(OPENCL)
    foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
      foreach(PRECISION ${PRECISIONS})
        set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
      endforeach()
      set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
    endforeach()
  endif()
  add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})

endif()

# ==================================================================================================
# Section for the tests: common part for both performance ('CLIENTS') and correctness ('TESTS')
if(CLIENTS OR TESTS)

  # Sets the specifics for the reference BLAS libraries
  set(REF_INCLUDES )
  set(REF_LIBRARIES )
  if(CLBLAS_FOUND)
    find_package(Threads)
    set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
    set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
    set(WRAPPERS ${WRAPPERS} test/wrapper_clblas.hpp)
    if(MSVC)
      add_definitions(" /DCLBLAST_REF_CLBLAS")
    else()
      add_definitions(" -DCLBLAST_REF_CLBLAS")
    endif()
  endif()
  if(CBLAS_FOUND OR MKL_FOUND)
    if(MKL_FOUND)  # prefers MKL over another CBLAS implementation
      set(REF_INCLUDES ${REF_INCLUDES} ${MKL_INCLUDE_DIRS})
      set(REF_LIBRARIES ${REF_LIBRARIES} ${MKL_LIBRARIES})
      if(MSVC)
        add_definitions(" /DCLBLAST_REF_CBLAS_MKL")
      else()
        add_definitions(" -DCLBLAST_REF_CBLAS_MKL")
      endif()
    else()
      set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
      set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
    endif()
    set(WRAPPERS ${WRAPPERS} test/wrapper_cblas.hpp)
    if(MSVC)
      add_definitions(" /DCLBLAST_REF_CBLAS")
    else()
      add_definitions(" -DCLBLAST_REF_CBLAS")
    endif()
  endif()
  if(CUBLAS_FOUND)
    set(REF_INCLUDES ${REF_INCLUDES} ${CUDA_INCLUDE_DIRS} ${CUBLAS_INCLUDE_DIRS})
    set(REF_LIBRARIES ${REF_LIBRARIES} ${CUDA_LIBRARIES} ${CUBLAS_LIBRARIES})
    set(WRAPPERS ${WRAPPERS} test/wrapper_cuda.hpp test/wrapper_cublas.hpp)
    if(MSVC)
      add_definitions(" /DCLBLAST_REF_CUBLAS")
    else()
      add_definitions(" -DCLBLAST_REF_CUBLAS")
    endif()
  endif()

endif()

# ==================================================================================================
# Section for the performance tests (i.e. the client). These optionally compare against a reference
# library, either clBLAS, a CPU BLAS, or CUDA's cuBLAS.
if(CLIENTS)

  set(CLIENTS_COMMON ${WRAPPERS} test/test_utilities.hpp test/performance/client.hpp test/routines/common.hpp)

  # Visual Studio requires the sources of non-exported objects/libraries
  if(MSVC)
    set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities/utilities.cpp test/test_utilities.cpp test/performance/client.cpp)
  else()
    # Creates the common performance-tests objects (requires CMake 2.8.8)
    add_library(test_performance_common OBJECT test/test_utilities.cpp test/performance/client.cpp)
    # Adds CLBlast's interface include paths because we can't link to CLBlast here
    target_include_directories(test_performance_common PRIVATE
      $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
      ${clblast_SOURCE_DIR} ${REF_INCLUDES})
    set(CLIENTS_COMMON ${CLIENTS_COMMON} $<TARGET_OBJECTS:test_performance_common>)
  endif()

  # Compiles the performance-tests
  foreach(ROUTINE ${LEVEL1_ROUTINES})
    add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
                   test/performance/routines/level1/${ROUTINE}.cpp
                   test/routines/level1/${ROUTINE}.hpp)
  endforeach()
  foreach(ROUTINE ${LEVEL2_ROUTINES})
    add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
                   test/performance/routines/level2/${ROUTINE}.cpp
                   test/routines/level2/${ROUTINE}.hpp)
  endforeach()
  foreach(ROUTINE ${LEVEL3_ROUTINES})
    add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
                   test/performance/routines/level3/${ROUTINE}.cpp
                   test/routines/level3/${ROUTINE}.hpp)
  endforeach()
  foreach(ROUTINE ${LEVELX_ROUTINES})
    add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
                   test/performance/routines/levelx/${ROUTINE}.cpp
                   test/routines/levelx/${ROUTINE}.hpp)
  endforeach()
  foreach(ROUTINE ${ROUTINES})
    target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
    target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
    install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
  endforeach()

endif()

# ==================================================================================================
# Section for the correctness tests. Note that these tests require the presence of clBLAS and/or a
# CPU BLAS library, and/or cuBLAS to act as a reference.
if(TESTS)
  enable_testing()
  set(TESTS_COMMON ${WRAPPERS} test/test_utilities.hpp
      test/correctness/testblas.hpp test/correctness/tester.hpp
      test/routines/common.hpp)

  # Visual Studio requires the sources of non-exported objects/libraries
  if(MSVC)
    set(TESTS_COMMON ${TESTS_COMMON} src/utilities/utilities.cpp
        test/test_utilities.cpp test/correctness/tester.cpp test/correctness/testblas.cpp)
  else()
    # Creates the common correctness-tests objects (requires CMake 2.8.8)
    add_library(test_correctness_common OBJECT
                test/test_utilities.cpp test/correctness/tester.cpp test/correctness/testblas.cpp)
    target_include_directories(test_correctness_common PUBLIC
      $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
      ${clblast_SOURCE_DIR} ${REF_INCLUDES})
    set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>)
  endif()

  # Compiles the correctness-tests
  foreach(ROUTINE ${LEVEL1_ROUTINES})
    add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
                   test/correctness/routines/level1/${ROUTINE}.cpp
                   test/routines/level1/${ROUTINE}.hpp)
  endforeach()
  foreach(ROUTINE ${LEVEL2_ROUTINES})
    add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
                   test/correctness/routines/level2/${ROUTINE}.cpp
                   test/routines/level2/${ROUTINE}.hpp)
  endforeach()
  foreach(ROUTINE ${LEVEL3_ROUTINES})
    add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
                   test/correctness/routines/level3/${ROUTINE}.cpp
                   test/routines/level3/${ROUTINE}.hpp)
  endforeach()
  foreach(ROUTINE ${LEVELX_ROUTINES})
    add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
                   test/correctness/routines/levelx/${ROUTINE}.cpp
                   test/routines/levelx/${ROUTINE}.hpp)
  endforeach()
  foreach(ROUTINE ${ROUTINES})
    target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
    install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
    target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
    add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE})
  endforeach()

  # Miscellaneous tests
  set(MISC_TESTS override_parameters retrieve_parameters)
  if(NOT CUDA)
    set(MISC_TESTS ${MISC_TESTS} preprocessor)
  endif()
  if(MSVC)
    set(TESTS_COMMON ${TESTS_COMMON} src/kernel_preprocessor.cpp src/utilities/compile.cpp)
  endif()
  foreach(MISC_TEST ${MISC_TESTS})
    add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON}
                   test/correctness/misc/${MISC_TEST}.cpp)
    target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${API_LIBRARIES})
    target_include_directories(clblast_test_${MISC_TEST} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
    add_test(clblast_test_${MISC_TEST} clblast_test_${MISC_TEST})
  endforeach()

  # CLBlast diagnostics
  add_executable(clblast_test_diagnostics ${TESTS_COMMON} test/diagnostics.cpp)
  target_link_libraries(clblast_test_diagnostics clblast ${REF_LIBRARIES} ${API_LIBRARIES})
  target_include_directories(clblast_test_diagnostics PUBLIC
    $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
    ${clblast_SOURCE_DIR} ${REF_INCLUDES})

  # Adds 'alltests' target: runs all tests
  set(ALLTESTS )
  set(ALLTESTSDEPENDS )
  foreach(ROUTINE ${ROUTINES})
    set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE})
    set(ALLTESTSDEPENDS clblast_test_${ROUTINE})
  endforeach()
  add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})

endif()

# ==================================================================================================
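The `install(EXPORT CLBlast ... FILE CLBlastConfig.cmake)` line in the build script above publishes the `clblast` target for use with `find_package` in downstream projects. As a minimal sketch of such a consumer (not part of this archive; the project name `myapp` and `main.cpp` are placeholders):

```cmake
# Hypothetical downstream CMakeLists.txt consuming an installed CLBlast
cmake_minimum_required(VERSION 3.12)
project(myapp CXX)

# Resolves the CLBlastConfig.cmake installed under <prefix>/lib/cmake/CLBlast;
# pass -DCMAKE_PREFIX_PATH=<prefix> if CLBlast is installed in a custom location
find_package(CLBlast REQUIRED)

add_executable(myapp main.cpp)
target_link_libraries(myapp clblast)  # the target name exported by install(EXPORT CLBlast ...)
```

===== CLBlast-1.6.3/CONTRIBUTING.md =====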
CLBlast: Contributing guidelines
================

For information about the CLBlast library, see the [README](README.md) file instead.

Tuning results
-------------

A [dedicated GitHub issue](https://github.com/CNugteren/CLBlast/issues/1) is available to post new tuning results. If you compiled with the tuners (see the [README](README.md) for instructions), ran one of the tuners on your device (or all of them, perhaps?), and feel that these results should be included in the next release of CLBlast, please post them there. You can do this by attaching the JSON files to the issue (archived in a .ZIP file).

Code improvements and additions
-------------

Pull requests are welcome as long as they:

* Contain unit test additions or modifications
* Follow the CLBlast coding style, which is loosely based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers. We use a tab-size of 2 spaces and a max-width of 100 characters.
* Are made against the `master` branch.
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) 
The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.

      Copyright 2015 Cedric Nugteren

      Licensed under the Apache License, Version 2.0 (the "License");
      you may not use this file except in compliance with the License.
      You may obtain a copy of the License at

          http://www.apache.org/licenses/LICENSE-2.0

      Unless required by applicable law or agreed to in writing, software
      distributed under the License is distributed on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      See the License for the specific language governing permissions and
      limitations under the License.

CLBlast-1.6.3/README.md

CLBlast: The tuned OpenCL BLAS library
================

| Platform | Build status |
|-----|-----|
| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) |
| Linux/macOS | [![Build Status](https://github.com/cnugteren/clblast/actions/workflows/build_and_test.yml/badge.svg?branch=master)](https://github.com/CNugteren/CLBlast/actions/workflows/build_and_test.yml) |

| Test machine (thanks to [ArrayFire](https://ci.arrayfire.org:8010/#/builders)) | Test status |
|-----|-----|
| clblast-linux-nvidia-a100 | [![Test Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-a100.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-a100) |
| clblast-linux-nvidia-k80 | [![Test Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-k80.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-k80) |
| clblast-linux-nvidia-p100 | [![Test Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-p100.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-p100) |
| clblast-linux-nvidia-t4 | [![Test Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-t4.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-t4) |
| clblast-linux-nvidia-v100 | [![Test Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-v100.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-v100) |
| clblast-windows-amd-r9 | [![Test Status](http://ci.arrayfire.org:8010/badges/clblast-windows-amd-r9.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-amd-r9) |
| clblast-windows-nvidia-m6000 | [![Test Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-m6000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-m6000) |

CLBlast is a lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on some devices.

The library is not tuned for all possible OpenCL devices: __if out-of-the-box performance is poor, please run the tuners first__.
See [the docs for a list of already tuned devices](doc/tuning.md#already-tuned-for-devices) and [instructions on how to tune yourself](doc/tuning.md) and contribute to future releases of the CLBlast library.

Why CLBlast and not clBLAS or cuBLAS?
-------------

Use CLBlast instead of clBLAS:

* When you care about achieving maximum performance.
* When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
* When you run on exotic OpenCL devices for which you need to tune yourself.
* When you are still running on OpenCL 1.1 hardware.
* When you prefer a C++ API over a C API (C API also available in CLBlast).
* When you value an organized and modern C++ codebase.
* When you target Intel CPUs and GPUs or embedded devices.
* When you can benefit from the increased performance of half-precision fp16 data-types.

Use CLBlast instead of cuBLAS:

* When you want your code to run on devices other than NVIDIA CUDA-enabled GPUs.
* When you want to tune for a specific configuration (e.g. rectangular matrix-sizes).
* When you sleep better if you know that the library you use is open-source.
* When you are using OpenCL rather than CUDA.

When not to use CLBlast:

* When you run on NVIDIA's CUDA-enabled GPUs only and can benefit from cuBLAS's assembly-level tuned kernels.

Getting started
-------------

CLBlast can be compiled with minimal dependencies (apart from OpenCL) in the usual CMake-way, e.g.:

    mkdir build && cd build
    cmake ..
    make

Detailed instructions for various platforms can be found [here](doc/installation.md).

Like clBLAS and cuBLAS, CLBlast also requires OpenCL device buffers as arguments to its routines. This means you'll have full control over the OpenCL buffers and the host-device memory transfers. CLBlast's API is designed to resemble clBLAS's C API as much as possible, requiring little integration effort in case clBLAS was previously used. Using CLBlast starts by including the C++ header:

    #include <clblast.h>

Or alternatively the plain C version:

    #include <clblast_c.h>

Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above-mentioned include files and the included [API documentation](doc/api.md). The API is kept as close as possible to the Netlib BLAS and the cuBLAS/clBLAS APIs. For an overview of the supported routines, see [here](doc/routines.md).

To get started quickly, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows:

    cmake -DSAMPLES=ON ..

Afterwards, you can optionally read more about running proper [benchmarks](doc/benchmarking.md) and [tuning the library](doc/tuning.md). A minimal example program is sketched below.
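To illustrate the integration effort, here is a minimal sketch (a hypothetical stand-alone program, not one of the official samples; device selection and error handling are reduced to the bare minimum) that performs a single-precision AXPY (_y = alpha * x + y_) through the C++ API:

    #include <vector>
    #include <clblast.h>

    int main() {
      // Select the first OpenCL platform and device (error checking omitted)
      cl_platform_id platform;
      clGetPlatformIDs(1, &platform, nullptr);
      cl_device_id device;
      clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);
      cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
      cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);

      // Host data and device buffers
      const size_t n = 1024;
      std::vector<float> host_x(n, 2.0f), host_y(n, 1.0f);
      cl_mem x = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(float), nullptr, nullptr);
      cl_mem y = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(float), nullptr, nullptr);
      clEnqueueWriteBuffer(queue, x, CL_TRUE, 0, n * sizeof(float), host_x.data(), 0, nullptr, nullptr);
      clEnqueueWriteBuffer(queue, y, CL_TRUE, 0, n * sizeof(float), host_y.data(), 0, nullptr, nullptr);

      // y = 3.0 * x + y; no library initialization is needed
      cl_event event = nullptr;
      const auto status = clblast::Axpy<float>(n, 3.0f, x, 0, 1, y, 0, 1, &queue, &event);
      if (status == clblast::StatusCode::kSuccess) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
      }

      // Read back the result and clean up
      clEnqueueReadBuffer(queue, y, CL_TRUE, 0, n * sizeof(float), host_y.data(), 0, nullptr, nullptr);
      clReleaseMemObject(x);
      clReleaseMemObject(y);
      clReleaseCommandQueue(queue);
      clReleaseContext(context);
      return 0;
    }

A real application would also check the return codes of the OpenCL calls; the official programs in the `samples` subfolder show more complete setups.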
Full documentation
-------------

More detailed documentation is available in separate files:

* [Building and installing](doc/installation.md)
* [Supported routines overview](doc/routines.md)
* [Performance measuring and benchmarking](doc/benchmarking.md)
* [Tuning for better performance](doc/tuning.md)
* [Testing the library for correctness](doc/testing.md)
* [Bindings / wrappers for other languages](doc/bindings.md)
* [More details on the GEMM kernel](doc/details_gemm.md)
* [More details on the convolution implementation](doc/details_conv.md)
* [Glossary with some terms explained](doc/glossary.md)
* [Frequently asked questions (FAQ) and their answers](doc/faq.md)

Known issues
-------------

Known issues:

* Correctness issues on Intel Arc A770 and several other devices with version 1.6.2 or lower (depends on the device). Upgrade to version 1.6.3 or newer.
* Routines returning an integer are currently not properly tested for half-precision FP16: IHAMAX/IHAMIN/IHMAX/IHMIN
* Half-precision FP16 tests might sometimes fail based on the order of multiplication, i.e. (a * b) * c != (c * b) * a
* The AMD APP SDK has a bug causing a conflict with libstdc++, resulting in a segfault when initialising static variables. This has been reported to occur with the CLBlast tuners.
* The AMD run-time compiler has a bug causing it to get stuck in an infinite loop. This is reported to happen occasionally when tuning the CLBlast GEMM routine.
* AMD Southern Island GPUs might cause wrong results with the amdgpu-pro drivers. Configure CMake with `AMD_SI_EMPTY_KERNEL_WORKAROUND` to resolve the issue, [see issue #301](https://github.com/CNugteren/CLBlast/issues/301).
* Tests might fail on an Intel IvyBridge GPU with the latest Beignet. Please downgrade Beignet to 1.2.1, [see issue #231](https://github.com/CNugteren/CLBlast/issues/231).

Contributing
-------------

Contributions are welcome, either in the form of tuning results for previously untested OpenCL devices or as pull requests. See [the contributing guidelines](CONTRIBUTING.md) for more details.

The main contributing authors (code, pull requests, testing) can be found in the list of [GitHub contributors](https://github.com/CNugteren/CLBlast/graphs/contributors).

Tuning and testing on a variety of OpenCL devices was made possible by:

* [TU/e ES research group](http://www.es.ele.tue.nl/)
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
* [ArrayFire](http://arrayfire.org)
* [TomTom](http://www.tomtom.com)
* Everyone reporting [tuning results](https://github.com/CNugteren/CLBlast/issues/1)

Hardware/software for this project was contributed by:

* [HPC research group at the University of Bristol](http://uob-hpc.github.io/zoo/) for access to their GPU zoo
* [ArrayFire](http://arrayfire.org) for setting up and supporting Buildbot correctness tests on multiple platforms
* [JetBrains](https://www.jetbrains.com/clion/) for supplying a free CLion IDE license for CLBlast developers
* [Travis CI](https://travis-ci.org/CNugteren/CLBlast/branches) and [AppVeyor](https://ci.appveyor.com/project/CNugteren/clblast) for free automated build tests for open-source projects

More information
-------------

Further information on CLBlast is available through the following links:

* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017.
  A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf). An updated version was also presented at IWOCL in May 2018. The slide set can be found [here as PDF](https://cnugteren.github.io/downloads/CLBlastIWOCL18.pdf).
* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (v1 May 2017, updated to v2 in April 2018). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.

How to cite this work:

    Cedric Nugteren. CLBlast: A Tuned OpenCL BLAS Library. In IWOCL'18: International Workshop on OpenCL. ACM, New York, NY, USA, 10 pages. 2018. https://doi.org/10.1145/3204919.3204924

Support us
-------------

This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. You can find contact information on the [website of the main author](http://cnugteren.github.io).

CLBlast-1.6.3/clblast.pc.in

prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@

Name: CLBlast
Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11
Version: @clblast_VERSION_MAJOR@.@clblast_VERSION_MINOR@.@clblast_VERSION_PATCH@
Libs: -L${libdir} -lclblast
Cflags: -I${includedir}

CLBlast-1.6.3/cmake/Modules/FindCBLAS.cmake

# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
#   Cedric Nugteren
#
# ==================================================================================================
#
# Defines the following variables:
#   CBLAS_FOUND          Boolean holding whether or not the Netlib BLAS library was found
#   CBLAS_INCLUDE_DIRS   The Netlib BLAS include directory
#   CBLAS_LIBRARIES      The Netlib BLAS library
#
# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to
# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be
# done using an environmental variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake
# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..).
# # ================================================================================================== # Sets the possible install locations set(CBLAS_HINTS ${CBLAS_ROOT} $ENV{CBLAS_ROOT} ) set(CBLAS_PATHS /usr /usr/local /usr/local/opt /System/Library/Frameworks ) # Finds the include directories find_path(CBLAS_INCLUDE_DIRS NAMES cblas.h HINTS ${CBLAS_HINTS} PATH_SUFFIXES include inc include/x86_64 include/x64 openblas openblas/include include/blis blis/include blis/include/blis Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers PATHS ${CBLAS_PATHS} DOC "Netlib BLAS include header cblas.h" ) mark_as_advanced(CBLAS_INCLUDE_DIRS) # Finds the library find_library(CBLAS_LIBRARIES NAMES cblas blas blis openblas libopenblas accelerate HINTS ${CBLAS_HINTS} PATH_SUFFIXES lib lib64 bin lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import openblas/bin openblas/lib blis/lib lib/atlas-base PATHS ${CBLAS_PATHS} DOC "Netlib BLAS library" ) mark_as_advanced(CBLAS_LIBRARIES) # ================================================================================================== # Notification messages if(NOT CBLAS_INCLUDE_DIRS) message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT") endif() if(NOT CBLAS_LIBRARIES) message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT") endif() # Determines whether or not BLAS was found include(FindPackageHandleStandardArgs) find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES) # ================================================================================================== CLBlast-1.6.3/cmake/Modules/FindMKL.cmake000066400000000000000000000065141463263031500177340ustar00rootroot00000000000000 # ================================================================================================== # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This # project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- # width of 100 characters per line. # # Author(s): # Cedric Nugteren # # ================================================================================================== # # Defines the following variables: # MKL_FOUND Boolean holding whether or not the Intel MKL BLAS library was found # MKL_INCLUDE_DIRS The Intel MKL BLAS include directory # MKL_LIBRARIES The Intel MKL BLAS library # # In case MKL is not installed in the default directory, set the MKL_ROOT variable to point to # the root of MKL, such that 'mkl_cblas.h' can be found in $MKL_ROOT/include. This can either be # done using an environmental variable (e.g. export MKL_ROOT=/path/to/MKL) or using a CMake # variable (e.g. cmake -DMKL_ROOT=/path/to/MKL ..). 
# # ================================================================================================== # Sets the possible install locations set(MKL_HINTS ${MKL_ROOT} $ENV{MKL_ROOT} $ENV{MKLROOT} $ENV{CMPLR_ROOT} ) set(MKL_PATHS /usr /usr/local /usr/local/opt /usr/local/mkl /opt/intel /opt/intel/mkl ) # Finds the include directories find_path(MKL_INCLUDE_DIRS NAMES mkl_cblas.h HINTS ${MKL_HINTS} PATH_SUFFIXES include inc include/x86_64 include/x64 PATHS ${MKL_PATHS} DOC "Intel MKL CBLAS include header mkl_cblas.h" ) mark_as_advanced(MKL_INCLUDE_DIRS) # Finds the libraries set(MKL_LIB_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import lib/intel64 linux/compiler/lib/intel64 windows/compiler/lib/intel64) find_library(MKL_LIBRARIES_LP64 NAMES mkl_intel_lp64 HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL lp64 library") find_library(MKL_LIBRARIES_THREAD NAMES mkl_intel_thread HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL thread library") find_library(MKL_LIBRARIES_CORE NAMES mkl_core HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL core library") find_library(MKL_LIBRARIES_OMP NAMES iomp5 libiomp5md HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel OpenMP library") set(MKL_LIBRARIES ${MKL_LIBRARIES_LP64} ${MKL_LIBRARIES_THREAD} ${MKL_LIBRARIES_CORE} ${MKL_LIBRARIES_OMP}) mark_as_advanced(MKL_LIBRARIES) # ================================================================================================== # Notification messages if(NOT MKL_INCLUDE_DIRS) message(STATUS "Could NOT find 'mkl_cblas.h', install it or set MKLROOT and CMPLR_ROOT or source setvars.sh or setvars.bat") endif() if(NOT MKL_LIBRARIES) message(STATUS "Could NOT find the Intel MKL BLAS library, install it or set MKLROOT and CMPLR_ROOT or source setvars.sh or setvars.bat") endif() # Determines whether or not MKL was found include(FindPackageHandleStandardArgs) find_package_handle_standard_args(MKL DEFAULT_MSG MKL_INCLUDE_DIRS MKL_LIBRARIES_LP64 MKL_LIBRARIES_THREAD MKL_LIBRARIES_CORE MKL_LIBRARIES_OMP) # ================================================================================================== CLBlast-1.6.3/cmake/Modules/FindOpenCL.cmake000066400000000000000000000053171463263031500204310ustar00rootroot00000000000000 # ================================================================================================== # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This # project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- # width of 100 characters per line. # # Author(s): # Cedric Nugteren # # ================================================================================================== # # Defines the following variables: # OPENCL_FOUND Boolean holding whether or not the OpenCL library was found # OPENCL_INCLUDE_DIRS The OpenCL include directory # OPENCL_LIBRARIES The OpenCL library # # In case OpenCL is not installed in the default directory, set the OPENCL_ROOT variable to point to # the root of OpenCL, such that 'OpenCL/cl.h' or 'CL/cl.h' can be found in $OPENCL_ROOT/include. # This can either be done using an environmental variable (e.g. export OPENCL_ROOT=/path/to/opencl) # or using a CMake variable (e.g. cmake -DOPENCL_ROOT=/path/to/opencl ..). 
# # ================================================================================================== # Sets the possible install locations set(OPENCL_HINTS ${OPENCL_ROOT} $ENV{OPENCL_ROOT} $ENV{OCL_ROOT} $ENV{AMDAPPSDKROOT} $ENV{CUDA_PATH} $ENV{INTELOCLSDKROOT} $ENV{NVSDKCOMPUTE_ROOT} $ENV{ATISTREAMSDKROOT} ) set(OPENCL_PATHS /usr/local/cuda /opt/cuda /opt/intel/opencl /usr /usr/local /opt/rocm/opencl ) # Finds the include directories find_path(OPENCL_INCLUDE_DIRS NAMES OpenCL/cl.h CL/cl.h HINTS ${OPENCL_HINTS} PATH_SUFFIXES include OpenCL/common/inc inc include/x86_64 include/x64 PATHS ${OPENCL_PATHS} DOC "OpenCL include header OpenCL/cl.h or CL/cl.h" ) mark_as_advanced(OPENCL_INCLUDE_DIRS) # Finds the library find_library(OPENCL_LIBRARIES NAMES OpenCL HINTS ${OPENCL_HINTS} PATH_SUFFIXES lib lib64 lib/x86_64 lib/x86_64/sdk lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64 PATHS ${OPENCL_PATHS} DOC "OpenCL library" ) mark_as_advanced(OPENCL_LIBRARIES) # ================================================================================================== # Notification messages if(NOT OPENCL_INCLUDE_DIRS) message(STATUS "Could NOT find 'OpenCL/cl.h' or 'CL/cl.h', install OpenCL or set OPENCL_ROOT") endif() if(NOT OPENCL_LIBRARIES) message(STATUS "Could NOT find OpenCL library, install it or set OPENCL_ROOT") endif() # Determines whether or not OpenCL was found include(FindPackageHandleStandardArgs) find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES) # ================================================================================================== CLBlast-1.6.3/cmake/Modules/FindclBLAS.cmake000066400000000000000000000046561463263031500203560ustar00rootroot00000000000000 # ================================================================================================== # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This # project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- # width of 100 characters per line. # # Author(s): # Cedric Nugteren # # ================================================================================================== # # Defines the following variables: # CLBLAS_FOUND Boolean holding whether or not the clBLAS library was found # CLBLAS_INCLUDE_DIRS The clBLAS include directory # CLBLAS_LIBRARIES The clBLAS library # # In case clBLAS is not installed in the default directory, set the CLBLAS_ROOT variable to point to # the root of clBLAS, such that 'clBLAS.h' can be found in $CLBLAS_ROOT/include. This can either be # done using an environmental variable (e.g. export CLBLAS_ROOT=/path/to/clBLAS) or using a CMake # variable (e.g. cmake -DCLBLAS_ROOT=/path/to/clBLAS ..). 
# # ================================================================================================== # Sets the possible install locations set(CLBLAS_HINTS ${CLBLAS_ROOT} $ENV{CLBLAS_ROOT} ) set(CLBLAS_PATHS /usr /usr/local ) # Finds the include directories find_path(CLBLAS_INCLUDE_DIRS NAMES clBLAS.h HINTS ${CLBLAS_HINTS} PATH_SUFFIXES include inc include/x86_64 include/x64 PATHS ${CLBLAS_PATHS} DOC "clBLAS include header clBLAS.h" ) mark_as_advanced(CLBLAS_INCLUDE_DIRS) # Finds the library find_library(CLBLAS_LIBRARIES NAMES clBLAS HINTS ${CLBLAS_HINTS} PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import PATHS ${CLBLAS_PATHS} DOC "clBLAS library" ) mark_as_advanced(CLBLAS_LIBRARIES) # ================================================================================================== # Notification messages if(NOT CLBLAS_INCLUDE_DIRS) message(STATUS "Could NOT find 'clBLAS.h', install clBLAS or set CLBLAS_ROOT") endif() if(NOT CLBLAS_LIBRARIES) message(STATUS "Could NOT find clBLAS library, install it or set CLBLAS_ROOT") endif() # Determines whether or not clBLAS was found include(FindPackageHandleStandardArgs) find_package_handle_standard_args(clBLAS DEFAULT_MSG CLBLAS_INCLUDE_DIRS CLBLAS_LIBRARIES) # ================================================================================================== CLBlast-1.6.3/cmake/Modules/FindcuBLAS.cmake000066400000000000000000000055761463263031500203710ustar00rootroot00000000000000 # ================================================================================================== # This file is part of the cuBLASt project. The project is licensed under Apache Version 2.0. This # project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- # width of 100 characters per line. # # Author(s): # Cedric Nugteren # # ================================================================================================== # # Defines the following variables: # CUBLAS_FOUND Boolean holding whether or not the cuBLAS library was found # CUBLAS_INCLUDE_DIRS The CUDA and cuBLAS include directory # CUDA_LIBRARIES The CUDA library # CUBLAS_LIBRARIES The cuBLAS library # # In case CUDA is not installed in the default directory, set the CUDA_ROOT variable to point to # the root of cuBLAS, such that 'cublas_v2.h' can be found in $CUDA_ROOT/include. This can either be # done using an environmental variable (e.g. export CUDA_ROOT=/path/to/cuBLAS) or using a CMake # variable (e.g. cmake -DCUDA_ROOT=/path/to/cuBLAS ..). 
# # ================================================================================================== # Sets the possible install locations set(CUBLAS_HINTS ${CUDA_ROOT} $ENV{CUDA_ROOT} $ENV{CUDA_TOOLKIT_ROOT_DIR} ) set(CUBLAS_PATHS /usr /usr/local /usr/local/cuda ) # Finds the include directories find_path(CUBLAS_INCLUDE_DIRS NAMES cublas_v2.h cuda.h HINTS ${CUBLAS_HINTS} PATH_SUFFIXES include inc include/x86_64 include/x64 PATHS ${CUBLAS_PATHS} DOC "cuBLAS include header cublas_v2.h" ) mark_as_advanced(CUBLAS_INCLUDE_DIRS) # Finds the libraries find_library(CUDA_LIBRARIES NAMES cudart HINTS ${CUBLAS_HINTS} PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import PATHS ${CUBLAS_PATHS} DOC "CUDA library" ) mark_as_advanced(CUDA_LIBRARIES) find_library(CUBLAS_LIBRARIES NAMES cublas HINTS ${CUBLAS_HINTS} PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import PATHS ${CUBLAS_PATHS} DOC "cuBLAS library" ) mark_as_advanced(CUBLAS_LIBRARIES) # ================================================================================================== # Notification messages if(NOT CUBLAS_INCLUDE_DIRS) message(STATUS "Could NOT find 'cuBLAS.h', install CUDA/cuBLAS or set CUDA_ROOT") endif() if(NOT CUDA_LIBRARIES) message(STATUS "Could NOT find CUDA library, install it or set CUDA_ROOT") endif() if(NOT CUBLAS_LIBRARIES) message(STATUS "Could NOT find cuBLAS library, install it or set CUDA_ROOT") endif() # Determines whether or not cuBLAS was found include(FindPackageHandleStandardArgs) find_package_handle_standard_args(cuBLAS DEFAULT_MSG CUBLAS_INCLUDE_DIRS CUDA_LIBRARIES CUBLAS_LIBRARIES) # ================================================================================================== CLBlast-1.6.3/cmake/c_flag_overrides.cmake000066400000000000000000000007031463263031500203670ustar00rootroot00000000000000# Overriding the CMake flags to use static runtime libraries # See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F if(MSVC) set(CMAKE_C_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1") set(CMAKE_C_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG") set(CMAKE_C_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG") endif() CLBlast-1.6.3/cmake/cxx_flag_overrides.cmake000066400000000000000000000007131463263031500207500ustar00rootroot00000000000000# Overriding the CMake flags to use static runtime libraries # See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F if(MSVC) set(CMAKE_CXX_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1") set(CMAKE_CXX_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG") set(CMAKE_CXX_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG") endif() CLBlast-1.6.3/doc/000077500000000000000000000000001463263031500135555ustar00rootroot00000000000000CLBlast-1.6.3/doc/api.md000066400000000000000000007030431463263031500146570ustar00rootroot00000000000000CLBlast: API reference ================ xSWAP: Swap two vectors ------------- Interchanges _n_ elements of vectors _x_ and _y_. 
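As a usage sketch (a hypothetical helper, assuming an already-created OpenCL command queue and two single-precision device buffers; the formal signatures and argument descriptions follow below):

```
#include <clblast.h>

// Swaps the first n elements of two contiguous float buffers (offset 0, stride 1)
clblast::StatusCode SwapVectors(const size_t n, cl_mem x, cl_mem y,
                                cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Swap<float>(n, x, 0, 1, y, 0, 1, &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);  // optionally block until the kernel has finished
    clReleaseEvent(event);
  }
  return status;
}
```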
C++ API: ``` template StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to SWAP: * `const size_t n`: Integer size argument. This value must be positive. * `cl_mem x_buffer`: OpenCL buffer to store the output x vector. * `const size_t x_offset`: The offset in elements from the start of the output x vector. * `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0. * `cl_mem y_buffer`: OpenCL buffer to store the output y vector. * `const size_t y_offset`: The offset in elements from the start of the output y vector. * `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xSCAL: Vector scaling ------------- Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_. C++ API: ``` template StatusCode Scal(const size_t n, const T alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastSscal(const size_t n, const float alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDscal(const size_t n, const double alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCscal(const size_t n, const cl_float2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZscal(const size_t n, const cl_double2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHscal(const size_t n, const cl_half alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to SCAL: * `const size_t n`: Integer size argument. This value must be positive. * `const T alpha`: Input scalar constant. * `cl_mem x_buffer`: OpenCL buffer to store the output x vector. 
* `const size_t x_offset`: The offset in elements from the start of the output x vector. * `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xCOPY: Vector copy ------------- Copies the contents of vector _x_ into vector _y_. C++ API: ``` template StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastScopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to COPY: * `const size_t n`: Integer size argument. This value must be positive. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `cl_mem y_buffer`: OpenCL buffer to store the output y vector. * `const size_t y_offset`: The offset in elements from the start of the output y vector. * `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xAXPY: Vector-times-constant plus vector ------------- Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant. 
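As a usage sketch illustrating the offset and increment arguments (a hypothetical helper; the buffers are assumed to be large enough for the strided accesses, and the formal signatures follow below):

```
#include <clblast.h>

// Computes y = alpha * x + y on every second element of x and y,
// skipping the first element of y (double precision)
clblast::StatusCode StridedAxpy(const size_t n, const double alpha,
                                cl_mem x, cl_mem y, cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Axpy<double>(n, alpha,
                                            x, 0, 2,  // x_offset = 0, x_inc = 2
                                            y, 1, 2,  // y_offset = 1, y_inc = 2
                                            &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return status;
}
```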
C++ API: ``` template StatusCode Axpy(const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastSaxpy(const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDaxpy(const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCaxpy(const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZaxpy(const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHaxpy(const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to AXPY: * `const size_t n`: Integer size argument. This value must be positive. * `const T alpha`: Input scalar constant. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `cl_mem y_buffer`: OpenCL buffer to store the output y vector. * `const size_t y_offset`: The offset in elements from the start of the output y vector. * `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xDOT: Dot product of two vectors ------------- Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer. 
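Note that the result is written to a device buffer rather than returned on the host. A usage sketch (hypothetical helper; `dot` is assumed to be a device buffer of at least one float):

```
#include <clblast.h>

// Computes dot = sum over i of x[i] * y[i] and reads the scalar back to the host
float DotProduct(const size_t n, cl_mem x, cl_mem y, cl_mem dot,
                 cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Dot<float>(n, dot, 0, x, 0, 1, y, 0, 1,
                                          &queue, &event);
  float result = 0.0f;
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
    clEnqueueReadBuffer(queue, dot, CL_TRUE, 0, sizeof(float), &result,
                        0, nullptr, nullptr);
  }
  return result;
}
```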
C++ API: ``` template StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastSdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to DOT: * `const size_t n`: Integer size argument. This value must be positive. * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. * `const size_t dot_offset`: The offset in elements from the start of the output dot vector. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. * `const size_t y_offset`: The offset in elements from the start of the input y vector. * `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xDOTU: Dot product of two complex vectors ------------- See the regular xDOT routine. C++ API: ``` template StatusCode Dotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastCdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to DOTU: * `const size_t n`: Integer size argument. This value must be positive. * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. * `const size_t dot_offset`: The offset in elements from the start of the output dot vector. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. 
* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. * `const size_t y_offset`: The offset in elements from the start of the input y vector. * `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xDOTC: Dot product of two complex vectors, one conjugated ------------- See the regular xDOT routine. C++ API: ``` template StatusCode Dotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastCdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to DOTC: * `const size_t n`: Integer size argument. This value must be positive. * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. * `const size_t dot_offset`: The offset in elements from the start of the output dot vector. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. * `const size_t y_offset`: The offset in elements from the start of the input y vector. * `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xNRM2: Euclidian norm of a vector ------------- Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer. 
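A usage sketch (hypothetical helper) computing the norm of a sub-vector through the offset argument; `nrm2` is assumed to be a one-element float device buffer:

```
#include <clblast.h>

// Computes the L2 norm of n elements of x, starting at element 'start'.
// The result is written to nrm2 on the device; read it back afterwards if needed.
clblast::StatusCode TailNorm(const size_t n, const size_t start,
                             cl_mem x, cl_mem nrm2, cl_command_queue queue) {
  return clblast::Nrm2<float>(n, nrm2, 0, x, start, 1, &queue, nullptr);
}
```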
C++ API: ``` template StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastSnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastScnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to NRM2: * `const size_t n`: Integer size argument. This value must be positive. * `cl_mem nrm2_buffer`: OpenCL buffer to store the output nrm2 vector. * `const size_t nrm2_offset`: The offset in elements from the start of the output nrm2 vector. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xASUM: Absolute sum of values in a vector ------------- Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer. C++ API: ``` template StatusCode Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastSasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastScasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to ASUM: * `const size_t n`: Integer size argument. This value must be positive. 
* `cl_mem asum_buffer`: OpenCL buffer to store the output asum vector. * `const size_t asum_offset`: The offset in elements from the start of the output asum vector. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xSUM: Sum of values in a vector (non-BLAS function) ------------- Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine. C++ API: ``` template StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastSsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastScsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to SUM: * `const size_t n`: Integer size argument. This value must be positive. * `cl_mem sum_buffer`: OpenCL buffer to store the output sum vector. * `const size_t sum_offset`: The offset in elements from the start of the output sum vector. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xAMAX: Index of absolute maximum value in a vector ------------- Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
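A usage sketch (hypothetical helper; `imax` is assumed to be a device buffer large enough to hold a single 32-bit integer index):

```
#include <clblast.h>

// Writes the index of the absolutely-largest element of x into the imax buffer
clblast::StatusCode ArgAbsMax(const size_t n, cl_mem x, cl_mem imax,
                              cl_command_queue queue) {
  return clblast::Amax<float>(n, imax, 0, x, 0, 1, &queue, nullptr);
}
```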
C++ API: ``` template StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiDamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiCamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiHamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to AMAX: * `const size_t n`: Integer size argument. This value must be positive. * `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector. * `const size_t imax_offset`: The offset in elements from the start of the output imax vector. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xAMIN: Index of absolute minimum value in a vector (non-BLAS function) ------------- Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. 
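Usage is identical to IxAMAX apart from the routine name. As a sketch (hypothetical helper; `imin` is assumed to be a device buffer holding a single 32-bit integer index):

```
#include <clblast.h>

// Writes the index of the absolutely-smallest element of x into the imin buffer
clblast::StatusCode ArgAbsMin(const size_t n, cl_mem x, cl_mem imin,
                              cl_command_queue queue) {
  return clblast::Amin<float>(n, imin, 0, x, 0, 1, &queue, nullptr);
}
```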
C++ API: ``` template StatusCode Amin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastiSamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiDamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiCamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiZamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiHamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) ``` Arguments to AMIN: * `const size_t n`: Integer size argument. This value must be positive. * `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector. * `const size_t imin_offset`: The offset in elements from the start of the output imin vector. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xMAX: Index of maximum value in a vector (non-BLAS function) ------------- Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine. 
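A sketch of the corresponding call (hypothetical helper; note that the comparison here is on the signed values, not their absolute values):

```
#include <clblast.h>

// Writes the index of the (signed) maximum element of x into the imax buffer
clblast::StatusCode ArgMax(const size_t n, cl_mem x, cl_mem imax,
                           cl_command_queue queue) {
  return clblast::Max<float>(n, imax, 0, x, 0, 1, &queue, nullptr);
}
```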
C++ API:
```
template <typename T>
StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastiDmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastiCmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastiHmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to MAX:

* `const size_t n`: Integer size argument. This value must be positive.
* `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector.
* `const size_t imax_offset`: The offset in elements from the start of the output imax vector.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

xMIN: Index of minimum value in a vector (non-BLAS function)
-------------

Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
C++ API:
```
template <typename T>
StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastiSmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastiDmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastiCmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastiHmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to MIN:

* `const size_t n`: Integer size argument. This value must be positive.
* `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector.
* `const size_t imin_offset`: The offset in elements from the start of the output imin vector.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

xGEMV: General matrix-vector multiplication
-------------

Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.
C++ API:
```
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to GEMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t y_offset`: The offset in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for GEMV:

* The value of `a_ld` must be at least `m`.
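The sketch below shows a single-precision GEMV call through the C API, computing _y = 2 * A * x + y_ for a column-major m-by-n matrix with `a_ld = m` (satisfying the requirement above). The helper name is hypothetical and the device buffers are assumed to be filled by the caller.

```
#include <assert.h>
#include <CL/cl.h>
#include <clblast_c.h>

// Hypothetical sketch: y = 2.0 * A * x + 1.0 * y for a column-major m-by-n matrix A.
// 'a', 'x' and 'y' are device buffers the caller has already filled.
void example_sgemv(cl_command_queue queue, const size_t m, const size_t n,
                   cl_mem a, cl_mem x, cl_mem y) {
  cl_event event = NULL;
  CLBlastStatusCode status = CLBlastSgemv(
      CLBlastLayoutColMajor, CLBlastTransposeNo, m, n,
      2.0f,             // alpha
      a, 0, m,          // a_buffer, a_offset, a_ld (>= m, as required above)
      x, 0, 1,          // x_buffer, x_offset, x_inc
      1.0f,             // beta
      y, 0, 1,          // y_buffer, y_offset, y_inc
      &queue, &event);
  assert(status == CLBlastSuccess);
  clWaitForEvents(1, &event);  // block until the kernel(s) have completed
  clReleaseEvent(event);
}
```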
xGBMV: General banded matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is banded instead.

C++ API:
```
template <typename T>
StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to GBMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t kl`: Integer size argument. This value must be positive.
* `const size_t ku`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t y_offset`: The offset in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for GBMV:

* The value of `a_ld` must be at least `kl + ku + 1`.
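The `kl + ku + 1` requirement follows from the conventional BLAS band-storage scheme, in which only the diagonals of the band are stored. The sketch below illustrates that packing for column-major data, under the assumption that CLBlast follows the standard Netlib convention (worth verifying against your data); the function name is hypothetical.

```
#include <stddef.h>

// Sketch of conventional column-major BLAS band storage: element A(i, j) of the
// dense m-by-n matrix maps to packed[(ku + i - j) + j * a_ld], for row indices
// max(0, j - ku) <= i <= min(m - 1, j + kl). Hence a_ld must be >= kl + ku + 1.
void pack_band_colmajor(const float* dense, float* packed,
                        size_t m, size_t n, size_t kl, size_t ku, size_t a_ld) {
  for (size_t j = 0; j < n; ++j) {
    size_t i_start = (j > ku) ? j - ku : 0;
    size_t i_end = (j + kl + 1 < m) ? j + kl + 1 : m;
    for (size_t i = i_start; i < i_end; ++i) {
      packed[(ku + i - j) + j * a_ld] = dense[i + j * m];  // dense is column-major
    }
  }
}
```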
xHEMV: Hermitian matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.

C++ API:
```
template <typename T>
StatusCode Hemv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to HEMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t y_offset`: The offset in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for HEMV:

* The value of `a_ld` must be at least `n`.
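For the complex routines, the C API passes scalars as `cl_float2`/`cl_double2` values. The hypothetical sketch below shows how such a scalar is initialised for a CHEMV call (real and imaginary parts in that order); buffers are assumed to be prepared by the caller.

```
#include <assert.h>
#include <CL/cl.h>
#include <clblast_c.h>

// Hypothetical sketch: y = (1 + 0i) * A * x + (0 + 0i) * y for a column-major
// n-by-n complex Hermitian matrix A with its upper triangle stored.
void example_chemv(cl_command_queue queue, const size_t n,
                   cl_mem a, cl_mem x, cl_mem y) {
  cl_float2 alpha = {{1.0f, 0.0f}};  // real part, imaginary part
  cl_float2 beta = {{0.0f, 0.0f}};
  cl_event event = NULL;
  CLBlastStatusCode status = CLBlastChemv(
      CLBlastLayoutColMajor, CLBlastTriangleUpper, n,
      alpha, a, 0, n,  // a_ld >= n, as required above
      x, 0, 1, beta, y, 0, 1,
      &queue, &event);
  assert(status == CLBlastSuccess);
  clWaitForEvents(1, &event);
  clReleaseEvent(event);
}
```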
xHBMV: Hermitian banded matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.

C++ API:
```
template <typename T>
StatusCode Hbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to HBMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t y_offset`: The offset in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for HBMV:

* The value of `a_ld` must be at least `k + 1`.

xHPMV: Hermitian packed matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.

C++ API:
```
template <typename T>
StatusCode Hpmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to HPMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t y_offset`: The offset in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

xSYMV: Symmetric matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is symmetric instead.
C++ API:
```
template <typename T>
StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to SYMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t y_offset`: The offset in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for SYMV:

* The value of `a_ld` must be at least `n`.

xSBMV: Symmetric banded matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.
C++ API:
```
template <typename T>
StatusCode Sbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to SBMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t y_offset`: The offset in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for SBMV:

* The value of `a_ld` must be at least `k + 1`.

xSPMV: Symmetric packed matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.
C++ API:
```
template <typename T>
StatusCode Spmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to SPMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t y_offset`: The offset in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

xTRMV: Triangular matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is triangular instead.
C++ API:
```
template <typename T>
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to TRMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t n`: Integer size argument. This value must be positive.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
* `const size_t x_offset`: The offset in elements from the start of the output x vector.
* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for TRMV:

* The value of `a_ld` must be at least `n`.

xTBMV: Triangular banded matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is triangular and banded instead.

C++ API:
```
template <typename T>
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to TBMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
* `const size_t x_offset`: The offset in elements from the start of the output x vector.
* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for TBMV:

* The value of `a_ld` must be at least `k + 1`.

xTPMV: Triangular packed matrix-vector multiplication
-------------

Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.

C++ API:
```
template <typename T>
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to TPMV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t n`: Integer size argument. This value must be positive.
* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
* `const size_t x_offset`: The offset in elements from the start of the output x vector.
* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

xTRSV: Solves a triangular system of equations
-------------

C++ API:
```
template <typename T>
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
```

Arguments to TRSV:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t n`: Integer size argument. This value must be positive.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
* `const size_t x_offset`: The offset in elements from the start of the output x vector.
* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
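Since `x_buffer` is both input and output here, a solve happens in-place: on entry it holds the right-hand side _b_, on exit the solution _x_. A hedged sketch for a lower-triangular single-precision system (hypothetical helper name, buffers prepared by the caller):

```
#include <assert.h>
#include <CL/cl.h>
#include <clblast_c.h>

// Hypothetical sketch: solve L * x = b in-place for a lower-triangular,
// non-unit-diagonal, column-major n-by-n matrix L. On entry 'x' holds b;
// on exit it holds the solution.
void example_strsv(cl_command_queue queue, const size_t n, cl_mem l, cl_mem x) {
  cl_event event = NULL;
  CLBlastStatusCode status = CLBlastStrsv(
      CLBlastLayoutColMajor, CLBlastTriangleLower,
      CLBlastTransposeNo, CLBlastDiagonalNonUnit, n,
      l, 0, n,   // a_buffer, a_offset, a_ld
      x, 0, 1,   // x_buffer (b on input, x on output), x_offset, x_inc
      &queue, &event);
  assert(status == CLBlastSuccess);
  clWaitForEvents(1, &event);
  clReleaseEvent(event);
}
```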
xGER: General rank-1 matrix update
-------------

Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.

C++ API:
```
template <typename T>
StatusCode Ger(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSger(const CLBlastLayout layout, const size_t m, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDger(const CLBlastLayout layout, const size_t m, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHger(const CLBlastLayout layout, const size_t m, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
```

Arguments to GER:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
* `const size_t y_offset`: The offset in elements from the start of the input y vector.
* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for GER:

* The value of `a_ld` must be at least `m`.
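A short sketch of a rank-1 update through the C API, under the same assumptions as the earlier examples (hypothetical helper, caller-managed buffers, column-major data with `a_ld = m`):

```
#include <assert.h>
#include <CL/cl.h>
#include <clblast_c.h>

// Hypothetical sketch: A += 0.5 * x * y^T for a column-major m-by-n matrix A.
void example_sger(cl_command_queue queue, const size_t m, const size_t n,
                  cl_mem x, cl_mem y, cl_mem a) {
  cl_event event = NULL;
  CLBlastStatusCode status = CLBlastSger(
      CLBlastLayoutColMajor, m, n,
      0.5f,              // alpha
      x, 0, 1, y, 0, 1,  // input vectors
      a, 0, m,           // a_ld >= m, as required above
      &queue, &event);
  assert(status == CLBlastSuccess);
  clWaitForEvents(1, &event);
  clReleaseEvent(event);
}
```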
xGERU: General rank-1 complex matrix update
-------------

Same operation as xGER, but with complex data-types.

C++ API:
```
template <typename T>
StatusCode Geru(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastCgeru(const CLBlastLayout layout, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgeru(const CLBlastLayout layout, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
```

Arguments to GERU:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
* `const size_t y_offset`: The offset in elements from the start of the input y vector.
* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for GERU:

* The value of `a_ld` must be at least `m`.

xGERC: General rank-1 complex conjugated matrix update
-------------

Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.

C++ API:
```
template <typename T>
StatusCode Gerc(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastCgerc(const CLBlastLayout layout, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgerc(const CLBlastLayout layout, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
```

Arguments to GERC:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
* `const size_t y_offset`: The offset in elements from the start of the input y vector.
* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for GERC:

* The value of `a_ld` must be at least `m`.
xHER: Hermitian rank-1 matrix update
-------------

Performs the operation _A = alpha * x * x^H + A_, in which _x_ is an input vector, _x^H_ is the conjugate transpose of this vector, _A_ is the triangular Hermitian matrix to be updated, and _alpha_ is a real-valued scalar.

C++ API:
```
template <typename T>
StatusCode Her(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event)
```

Arguments to HER:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for HER:

* The value of `a_ld` must be at least `n`.
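Note that, as the C signatures above show, _alpha_ is real-valued even in the complex routines (`float` for `CLBlastCher`, `double` for `CLBlastZher`): a complex _alpha_ would not preserve the Hermitian property of _A_. A hedged sketch (hypothetical helper, caller-managed buffers):

```
#include <assert.h>
#include <CL/cl.h>
#include <clblast_c.h>

// Hypothetical sketch: A += alpha * x * x^H on the upper triangle of a
// column-major n-by-n complex Hermitian matrix A. Note the plain-float alpha.
void example_cher(cl_command_queue queue, const size_t n, cl_mem x, cl_mem a) {
  cl_event event = NULL;
  CLBlastStatusCode status = CLBlastCher(
      CLBlastLayoutColMajor, CLBlastTriangleUpper, n,
      1.0f,      // alpha: float, not cl_float2
      x, 0, 1,
      a, 0, n,   // a_ld >= n, as required above
      &queue, &event);
  assert(status == CLBlastSuccess);
  clWaitForEvents(1, &event);
  clReleaseEvent(event);
}
```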
* `const size_t n`: Integer size argument. This value must be positive. * `const T alpha`: Input scalar constant. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix. * `const size_t ap_offset`: The offset in elements from the start of the output AP matrix. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xHER2: Hermitian rank-2 matrix update ------------- Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate. C++ API: ``` template StatusCode Her2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) ``` Arguments to HER2: * `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. * `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). * `const size_t n`: Integer size argument. This value must be positive. * `const T alpha`: Input scalar constant. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. * `const size_t y_offset`: The offset in elements from the start of the input y vector. * `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0. * `cl_mem a_buffer`: OpenCL buffer to store the output A matrix. * `const size_t a_offset`: The offset in elements from the start of the output A matrix. * `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0. 
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. Requirements for HER2: * The value of `a_ld` must be at least `n`. xHPR2: Hermitian packed rank-2 matrix update ------------- Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_. C++ API: ``` template StatusCode Hpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) ``` C API: ``` CLBlastStatusCode CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) ``` Arguments to HPR2: * `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. * `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). * `const size_t n`: Integer size argument. This value must be positive. * `const T alpha`: Input scalar constant. * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. * `const size_t x_offset`: The offset in elements from the start of the input x vector. * `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. * `const size_t y_offset`: The offset in elements from the start of the input y vector. * `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0. * `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix. * `const size_t ap_offset`: The offset in elements from the start of the output AP matrix. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. xSYR: Symmetric rank-1 matrix update ------------- Same operation as xHER, but matrix A is a symmetric matrix instead. 
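As a usage illustration, here is a hedged sketch of calling the single-precision variant through the C API listed below; the helper name is an assumption, and an initialized OpenCL command queue plus device buffers of sufficient size are taken as given:

```
// Hedged sketch: A (n-by-n, column-major, upper triangle) += 2 * x * x^T,
// assuming `queue`, `x` and `a` are already set up by the caller.
#include <clblast_c.h>

CLBlastStatusCode rank1_update(cl_command_queue queue,
                               cl_mem x, cl_mem a, size_t n) {
  cl_event event = NULL;
  CLBlastStatusCode status =
      CLBlastSsyr(CLBlastLayoutColMajor, CLBlastTriangleUpper, n,
                  2.0f,     /* alpha */
                  x, 0, 1,  /* x_buffer, x_offset, x_inc */
                  a, 0, n,  /* a_buffer, a_offset, a_ld */
                  &queue, &event);
  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);  // wait for the kernel(s) to finish
    clReleaseEvent(event);
  }
  return status;
}
```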
C++ API:
```
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle,
               const size_t n,
               const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
               cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                              const size_t n,
                              const float alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                              cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                              const size_t n,
                              const double alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                              cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                              const size_t n,
                              const cl_half alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                              cl_command_queue* queue, cl_event* event)
```

Arguments to SYR:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for SYR:

* The value of `a_ld` must be at least `n`.


xSPR: Symmetric packed rank-1 matrix update
-------------

Same operation as xSYR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.
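Packed storage keeps only one triangle of the matrix, column by column, in a linear buffer. As a hedged illustration (the helper name is an assumption, but the indexing is the standard BLAS packed scheme), element _(i, j)_ of the upper triangle with _i <= j_ in column-major order lives at:

```
// Standard packed-upper, column-major indexing: column j contributes j+1
// elements, so columns 0..j-1 occupy j*(j+1)/2 slots before it.
#include <stddef.h>

size_t packed_upper_colmajor_index(size_t i, size_t j) {
  return i + (j * (j + 1)) / 2;  // valid for i <= j
}
```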
C++ API:
```
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle,
               const size_t n,
               const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem ap_buffer, const size_t ap_offset,
               cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                              const size_t n,
                              const float alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              cl_mem ap_buffer, const size_t ap_offset,
                              cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                              const size_t n,
                              const double alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              cl_mem ap_buffer, const size_t ap_offset,
                              cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
                              const size_t n,
                              const cl_half alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              cl_mem ap_buffer, const size_t ap_offset,
                              cl_command_queue* queue, cl_event* event)
```

Arguments to SPR:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.


xSYR2: Symmetric rank-2 matrix update
-------------

Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.
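For the real-valued case the update simplifies to _A(i,j) += alpha * (x(i) * y(j) + y(i) * x(j))_. A hedged plain-C reference sketch (illustrative names, not CLBlast code) for the upper triangle in column-major order:

```
// Reference semantics only: A = alpha*x*y^T + alpha*y*x^T + A on the upper
// triangle of a column-major n-by-n matrix with a_ld >= n.
#include <stddef.h>

void syr2_upper_colmajor_ref(size_t n, float alpha,
                             const float* x, size_t x_inc,
                             const float* y, size_t y_inc,
                             float* a, size_t a_ld) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t i = 0; i <= j; ++i) {
      a[i + j * a_ld] += alpha * (x[i * x_inc] * y[j * y_inc] +
                                  y[i * y_inc] * x[j * x_inc]);
    }
  }
}
```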
C++ API:
```
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                               const size_t n,
                               const float alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                               const size_t n,
                               const double alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                               const size_t n,
                               const cl_half alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to SYR2:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
* `const size_t y_offset`: The offset in elements from the start of the input y vector.
* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for SYR2:

* The value of `a_ld` must be at least `n`.


xSPR2: Symmetric packed rank-2 matrix update
-------------

Same operation as xSYR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.

C++ API:
```
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem ap_buffer, const size_t ap_offset,
                cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                               const size_t n,
                               const float alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_mem ap_buffer, const size_t ap_offset,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                               const size_t n,
                               const double alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_mem ap_buffer, const size_t ap_offset,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
                               const size_t n,
                               const cl_half alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_mem ap_buffer, const size_t ap_offset,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to SPR2:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
* `const size_t y_offset`: The offset in elements from the start of the input y vector.
* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.


xGEMM: General matrix-matrix multiplication
-------------

Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.
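To pin down the semantics (not the tuned OpenCL kernels CLBlast actually runs), here is a hedged naive reference sketch for the non-transposed, column-major case; all names are illustrative:

```
// Reference semantics only: C = alpha*A*B + beta*C with column-major A
// (m-by-k, a_ld >= m), B (k-by-n, b_ld >= k) and C (m-by-n, c_ld >= m).
#include <stddef.h>

void gemm_ref(size_t m, size_t n, size_t k, float alpha,
              const float* a, size_t a_ld,
              const float* b, size_t b_ld,
              float beta, float* c, size_t c_ld) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t i = 0; i < m; ++i) {
      float acc = 0.0f;
      for (size_t p = 0; p < k; ++p) {
        acc += a[i + p * a_ld] * b[p + j * b_ld];
      }
      c[i + j * c_ld] = alpha * acc + beta * c[i + j * c_ld];
    }
  }
}
```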
C++ API:
```
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                const size_t m, const size_t n, const size_t k,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event,
                cl_mem temp_buffer = nullptr)
```

C API:
```
CLBlastStatusCode CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                               const size_t m, const size_t n, const size_t k,
                               const float alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const float beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                               const size_t m, const size_t n, const size_t k,
                               const double alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const double beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                               const size_t m, const size_t n, const size_t k,
                               const cl_float2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const cl_float2 beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                               const size_t m, const size_t n, const size_t k,
                               const cl_double2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const cl_double2 beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                               const size_t m, const size_t n, const size_t k,
                               const cl_half alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const cl_half beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to GEMM:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for GEMM:

* When `(a_transpose == Transpose::kNo && layout == Layout::kColMajor) || (a_transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`.
* When `(b_transpose == Transpose::kNo && layout == Layout::kColMajor) || (b_transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`.
* The value of `c_ld` must be at least `m`.


xSYMM: Symmetric matrix-matrix multiplication
-------------

Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmetric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.
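As a usage illustration of the `side` semantics, here is a hedged sketch of a left-sided single-precision call through the C API listed below; the helper name is an assumption and the queue/buffers are taken as already created:

```
// Hedged sketch: C = A*B with A symmetric m-by-m (upper triangle stored),
// B and C m-by-n, all column-major. Caller owns queue and buffers.
#include <clblast_c.h>

CLBlastStatusCode symm_left_upper(cl_command_queue queue, size_t m, size_t n,
                                  cl_mem a, cl_mem b, cl_mem c) {
  cl_event event = NULL;
  CLBlastStatusCode status = CLBlastSsymm(
      CLBlastLayoutColMajor, CLBlastSideLeft, CLBlastTriangleUpper,
      m, n,
      1.0f,     /* alpha */
      a, 0, m,  /* A is m-by-m when side is kLeft, so a_ld >= m */
      b, 0, m,  /* B is m-by-n */
      0.0f,     /* beta */
      c, 0, m,  /* C is m-by-n */
      &queue, &event);
  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return status;
}
```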
C++ API:
```
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                               const size_t m, const size_t n,
                               const float alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const float beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                               const size_t m, const size_t n,
                               const double alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const double beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                               const size_t m, const size_t n,
                               const cl_float2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const cl_float2 beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                               const size_t m, const size_t n,
                               const cl_double2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const cl_double2 beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                               const size_t m, const size_t n,
                               const cl_half alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const cl_half beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to SYMM:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for SYMM:

* When `side == Side::kLeft` then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
* The value of `b_ld` must be at least `m`.
* The value of `c_ld` must be at least `m`.


xHEMM: Hermitian matrix-matrix multiplication
-------------

Same operation as xSYMM, but _A_ is an Hermitian matrix instead.

C++ API:
```
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                               const size_t m, const size_t n,
                               const cl_float2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const cl_float2 beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
                               const size_t m, const size_t n,
                               const cl_double2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               const cl_double2 beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to HEMM:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for HEMM:

* When `side == Side::kLeft` then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
* The value of `b_ld` must be at least `m`.
* The value of `c_ld` must be at least `m`.


xSYRK: Rank-K update of a symmetric matrix
-------------

Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.

C++ API:
```
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                               const size_t n, const size_t k,
                               const float alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const float beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                               const size_t n, const size_t k,
                               const double alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const double beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                               const size_t n, const size_t k,
                               const cl_float2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_float2 beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                               const size_t n, const size_t k,
                               const cl_double2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_double2 beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                               const size_t n, const size_t k,
                               const cl_half alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const cl_half beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to SYRK:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for SYRK:

* When `(a_transpose == Transpose::kNo && layout == Layout::kColMajor) || (a_transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
* The value of `c_ld` must be at least `n`.


xHERK: Rank-K update of a Hermitian matrix
-------------

Same operation as xSYRK, but _C_ is an Hermitian matrix instead.

C++ API:
```
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                               const size_t n, const size_t k,
                               const float alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const float beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
                               const size_t n, const size_t k,
                               const double alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               const double beta,
                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to HERK:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for HERK:

* When `(a_transpose == Transpose::kNo && layout == Layout::kColMajor) || (a_transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
* The value of `c_ld` must be at least `n`.


xSYR2K: Rank-2K update of a symmetric matrix
-------------

Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.

C++ API:
```
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 const T beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                const size_t n, const size_t k,
                                const float alpha,
                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                const float beta,
                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                const size_t n, const size_t k,
                                const double alpha,
                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                const double beta,
                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                const size_t n, const size_t k,
                                const cl_float2 alpha,
                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                const cl_float2 beta,
                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                const size_t n, const size_t k,
                                const cl_double2 alpha,
                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                const cl_double2 beta,
                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                const size_t n, const size_t k,
                                const cl_half alpha,
                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                const cl_half beta,
                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                cl_command_queue* queue, cl_event* event)
```

Arguments to SYR2K:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose ab_transpose`: Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for SYR2K:

* When `(ab_transpose == Transpose::kNo && layout == Layout::kColMajor) || (ab_transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
* When `(ab_transpose == Transpose::kNo && layout == Layout::kColMajor) || (ab_transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`.
* The value of `c_ld` must be at least `n`.


xHER2K: Rank-2K update of a Hermitian matrix
-------------

Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.
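One detail worth calling out: unlike xSYR2K, here _alpha_ is complex but _beta_ is real (see the `cl_float2` versus `float` types in the C API below), which keeps _C_ Hermitian. A hedged call sketch with illustrative names and caller-provided buffers:

```
// Hedged sketch: C (n-by-n Hermitian, lower triangle) += A*B^H + B*A^H + C,
// with A and B n-by-k, column-major. Caller owns queue and buffers.
#include <clblast_c.h>

CLBlastStatusCode her2k_update(cl_command_queue queue, size_t n, size_t k,
                               cl_mem a, cl_mem b, cl_mem c) {
  cl_float2 alpha = {{1.0f, 0.0f}};  /* complex scalar */
  float beta = 1.0f;                 /* real scalar */
  cl_event event = NULL;
  CLBlastStatusCode status = CLBlastCher2k(
      CLBlastLayoutColMajor, CLBlastTriangleLower, CLBlastTransposeNo,
      n, k, alpha,
      a, 0, n,  /* A is n-by-k without transpose, so a_ld >= n */
      b, 0, n,  /* same shape rule for B */
      beta,
      c, 0, n,  /* C is n-by-n */
      &queue, &event);
  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return status;
}
```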
C++ API:
```
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 const U beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                const size_t n, const size_t k,
                                const cl_float2 alpha,
                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                const float beta,
                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
                                const size_t n, const size_t k,
                                const cl_double2 alpha,
                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                const double beta,
                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                cl_command_queue* queue, cl_event* event)
```

Arguments to HER2K:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose ab_transpose`: Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
* `const U beta`: Input scalar constant.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for HER2K:

* When `(ab_transpose == Transpose::kNo && layout == Layout::kColMajor) || (ab_transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
* When `(ab_transpose == Transpose::kNo && layout == Layout::kColMajor) || (ab_transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`.
* The value of `c_ld` must be at least `n`.


xTRMM: Triangular matrix-matrix multiplication
-------------

Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.

C++ API:
```
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const float alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const double alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const cl_float2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const cl_double2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const cl_half alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to TRMM:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for TRMM:

* When `side == Side::kLeft` then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
* The value of `b_ld` must be at least `m`.


xTRSM: Solves a triangular system of equations
-------------

Solves the equation _A * X = alpha * B_ (for `Side::kLeft`) or _X * A = alpha * B_ (for `Side::kRight`) for the unknown _m_ by _n_ matrix _X_, in which _A_ is an _m_ by _m_ (left side) or _n_ by _n_ (right side) unit or non-unit triangular matrix and _B_ is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.

C++ API:
```
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const float alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const double alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const cl_float2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                               const size_t m, const size_t n,
                               const cl_double2 alpha,
                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                               cl_command_queue* queue, cl_event* event)
```

Arguments to TRSM:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.


xHAD: Element-wise vector product (Hadamard)
-------------

Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.
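Since the operation is purely element-wise, a hedged plain-C reference sketch (illustrative names, not CLBlast code) is short:

```
// Reference semantics only: z[i] = alpha * x[i] * y[i] + beta * z[i],
// with the usual stride/increment handling for each vector.
#include <stddef.h>

void had_ref(size_t n, float alpha,
             const float* x, size_t x_inc,
             const float* y, size_t y_inc,
             float beta, float* z, size_t z_inc) {
  for (size_t i = 0; i < n; ++i) {
    z[i * z_inc] = alpha * x[i * x_inc] * y[i * y_inc] + beta * z[i * z_inc];
  }
}
```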
C++ API:
```
template <typename T>
StatusCode Had(const size_t n,
               const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
               const T beta,
               cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
               cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastShad(const size_t n,
                              const float alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                              const float beta,
                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                              cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDhad(const size_t n,
                              const double alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                              const double beta,
                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                              cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastChad(const size_t n,
                              const cl_float2 alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                              const cl_float2 beta,
                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                              cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZhad(const size_t n,
                              const cl_double2 alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                              const cl_double2 beta,
                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                              cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHhad(const size_t n,
                              const cl_half alpha,
                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                              const cl_half beta,
                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
                              cl_command_queue* queue, cl_event* event)
```

Arguments to HAD:

* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t x_offset`: The offset in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
* `const size_t y_offset`: The offset in elements from the start of the input y vector.
* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
* `const T beta`: Input scalar constant.
* `cl_mem z_buffer`: OpenCL buffer to store the output z vector.
* `const size_t z_offset`: The offset in elements from the start of the output z vector.
* `const size_t z_inc`: Stride/increment of the output z vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.


xOMATCOPY: Scaling and out-of-place transpose/copy (non-BLAS function)
-------------

Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.
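A hedged plain-C reference sketch of the transpose case (`op = transpose`) for column-major storage; the names are illustrative, and the leading-dimension bounds match the requirements listed after the APIs below:

```
// Reference semantics only: B = alpha * A^T with column-major A (m-by-n,
// a_ld >= m) and B (n-by-m, b_ld >= n).
#include <stddef.h>

void omatcopy_transpose_colmajor_ref(size_t m, size_t n, float alpha,
                                     const float* a, size_t a_ld,
                                     float* b, size_t b_ld) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t i = 0; i < m; ++i) {
      b[j + i * b_ld] = alpha * a[i + j * a_ld];  // B(j,i) = alpha * A(i,j)
    }
  }
}
```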
xOMATCOPY: Scaling and out-of-place transpose/copy (non-BLAS function)
-------------

Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.

C++ API:
```
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event)
```

Arguments to OMATCOPY:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.

Requirements for OMATCOPY:

* The value of `a_ld` must be at least `m`.
* The value of `b_ld` must be at least `n`.
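As an illustration, a hedged sketch of a scaled transpose using the C++ API (a square matrix is used so the leading-dimension requirements are trivially met; `queue`, `a` and `b` are assumed to be a valid queue and buffers of at least 9 floats, set up as in the HAD example above):

```
// Sketch: B = 1.5 * A^T for a 3x3 row-major matrix.
// Assumes valid `queue` (cl_command_queue) and `a`, `b` (cl_mem) from earlier setup.
const size_t m = 3, n = 3;
cl_event event = nullptr;
auto status = clblast::Omatcopy<float>(clblast::Layout::kRowMajor, clblast::Transpose::kYes,
                                       m, n, 1.5f,
                                       a, 0, n,  // a_ld = n for a row-major m-by-n matrix
                                       b, 0, m,  // b_ld = m for the transposed result
                                       &queue, &event);
```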
xIM2COL: Im2col function (non-BLAS function)
-------------

Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix. Overwrites any existing values in the _col_ buffer.

C++ API:
```
template <typename T>
StatusCode Im2col(const KernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event)
```

Arguments to IM2COL:

* `const KernelMode kernel_mode`: The kernel mode, either `KernelMode::kCrossCorrelation` for the normal mode, or `KernelMode::kConvolution` for the convolution mode that flips a kernel along `h` and `w` axes.
* `const size_t channels`: Integer size argument. This value must be positive.
* `const size_t height`: Integer size argument. This value must be positive.
* `const size_t width`: Integer size argument. This value must be positive.
* `const size_t kernel_h`: Integer size argument. This value must be positive.
* `const size_t kernel_w`: Integer size argument. This value must be positive.
* `const size_t pad_h`: Integer size argument. This value must be positive.
* `const size_t pad_w`: Integer size argument. This value must be positive.
* `const size_t stride_h`: Integer size argument. This value must be positive.
* `const size_t stride_w`: Integer size argument. This value must be positive.
* `const size_t dilation_h`: Integer size argument. This value must be positive.
* `const size_t dilation_w`: Integer size argument. This value must be positive.
* `const cl_mem im_buffer`: OpenCL buffer to store the input im tensor.
* `const size_t im_offset`: The offset in elements from the start of the input im tensor.
* `cl_mem col_buffer`: OpenCL buffer to store the output col tensor.
* `const size_t col_offset`: The offset in elements from the start of the output col tensor.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
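To size the _col_ buffer, the usual im2col output geometry applies. The sketch below uses the standard convolution arithmetic; it is an assumption stated here for convenience, not a quote from the CLBlast sources, so verify against your version if the sizes matter:

```
// Sketch of the usual im2col output geometry (standard convolution arithmetic).
const size_t channels = 3, height = 32, width = 32;
const size_t kernel_h = 3, kernel_w = 3, pad_h = 1, pad_w = 1;
const size_t stride_h = 1, stride_w = 1, dilation_h = 1, dilation_w = 1;
const size_t col_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const size_t col_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
const size_t col_size = channels * kernel_h * kernel_w * col_h * col_w;  // elements in the col buffer
```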
xCOL2IM: Col2im function (non-BLAS function)
-------------

Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix. Accumulates results on top of the existing values in the _im_ buffer.

C++ API:
```
template <typename T>
StatusCode Col2im(const KernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastScol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event)
```

Arguments to COL2IM:

* `const KernelMode kernel_mode`: The kernel mode, either `KernelMode::kCrossCorrelation` for the normal mode, or `KernelMode::kConvolution` for the convolution mode that flips a kernel along `h` and `w` axes.
* `const size_t channels`: Integer size argument. This value must be positive.
* `const size_t height`: Integer size argument. This value must be positive.
* `const size_t width`: Integer size argument. This value must be positive.
* `const size_t kernel_h`: Integer size argument. This value must be positive.
* `const size_t kernel_w`: Integer size argument. This value must be positive.
* `const size_t pad_h`: Integer size argument. This value must be positive.
* `const size_t pad_w`: Integer size argument. This value must be positive.
* `const size_t stride_h`: Integer size argument. This value must be positive.
* `const size_t stride_w`: Integer size argument. This value must be positive.
* `const size_t dilation_h`: Integer size argument. This value must be positive.
* `const size_t dilation_w`: Integer size argument. This value must be positive.
* `const cl_mem col_buffer`: OpenCL buffer to store the input col tensor.
* `const size_t col_offset`: The offset in elements from the start of the input col tensor.
* `cl_mem im_buffer`: OpenCL buffer to store the output im tensor.
* `const size_t im_offset`: The offset in elements from the start of the output im tensor.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
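A hedged sketch of a col2im call (the geometry variables from the im2col sizing sketch above are assumed to be in scope, together with a valid `queue` and `col`/`im` buffers):

```
// Sketch: scatter a col buffer back into an image tensor.
// Note: col2im accumulates into `im`, so zero-fill the buffer first
// if a plain inverse mapping of im2col is wanted.
cl_event event = nullptr;
auto status = clblast::Col2im<float>(clblast::KernelMode::kCrossCorrelation,
                                     channels, height, width,
                                     kernel_h, kernel_w, pad_h, pad_w,
                                     stride_h, stride_w, dilation_h, dilation_w,
                                     col, 0, im, 0, &queue, &event);
```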
xCONVGEMM: Batched convolution as GEMM (non-BLAS function)
-------------

Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).

C++ API:
```
template <typename T>
StatusCode Convgemm(const KernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event)
```

Arguments to CONVGEMM:

* `const KernelMode kernel_mode`: The kernel mode, either `KernelMode::kCrossCorrelation` for the normal mode, or `KernelMode::kConvolution` for the convolution mode that flips a kernel along `h` and `w` axes.
* `const size_t channels`: Integer size argument. This value must be positive.
* `const size_t height`: Integer size argument. This value must be positive.
* `const size_t width`: Integer size argument. This value must be positive.
* `const size_t kernel_h`: Integer size argument. This value must be positive.
* `const size_t kernel_w`: Integer size argument. This value must be positive.
* `const size_t pad_h`: Integer size argument. This value must be positive.
* `const size_t pad_w`: Integer size argument. This value must be positive.
* `const size_t stride_h`: Integer size argument. This value must be positive.
* `const size_t stride_w`: Integer size argument. This value must be positive.
* `const size_t dilation_h`: Integer size argument. This value must be positive.
* `const size_t dilation_w`: Integer size argument. This value must be positive.
* `const size_t num_kernels`: Integer size argument. This value must be positive.
* `const size_t batch_count`: Integer size argument. This value must be positive.
* `const cl_mem im_buffer`: OpenCL buffer to store the input im tensor.
* `const size_t im_offset`: The offset in elements from the start of the input im tensor.
* `const cl_mem kernel_buffer`: OpenCL buffer to store the input kernel tensor.
* `const size_t kernel_offset`: The offset in elements from the start of the input kernel tensor.
* `cl_mem result_buffer`: OpenCL buffer to store the output result tensor.
* `const size_t result_offset`: The offset in elements from the start of the output result tensor.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
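A hedged sketch of a CONVGEMM call (again reusing the geometry variables from the im2col sizing sketch; `queue`, `im`, `kernel` and `result` are assumed to be a valid queue and buffers of matching sizes — the output spatial size follows the same arithmetic as that sketch):

```
// Sketch: batched convolution of 4 NCHW images with 16 filters.
const size_t num_kernels = 16, batch_count = 4;
cl_event event = nullptr;
auto status = clblast::Convgemm<float>(clblast::KernelMode::kCrossCorrelation,
                                       channels, height, width,
                                       kernel_h, kernel_w, pad_h, pad_w,
                                       stride_h, stride_w, dilation_h, dilation_w,
                                       num_kernels, batch_count,
                                       im, 0, kernel, 0, result, 0, &queue, &event);
```

Note that, per the C API listing above, CONVGEMM is only available for the S, D, and H precisions.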
xAXPYBATCHED: Batched version of AXPY
-------------

As AXPY, but multiple operations are batched together for better performance.

C++ API:
```
template <typename T>
StatusCode AxpyBatched(const size_t n, const T *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event)
```

Arguments to AXPYBATCHED:

* `const size_t n`: Integer size argument. This value must be positive.
* `const T *alphas`: Input scalar constants.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t *x_offsets`: The offsets in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t *y_offsets`: The offsets in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `const size_t batch_count`: Number of batches. This value must be positive.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
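For illustration, a minimal sketch of a batched AXPY over two disjoint slices of the same buffers (`queue`, `x` and `y` are assumed to be a valid queue and buffers holding at least `2*n` floats each, set up as in the HAD example):

```
// Sketch: y[0:n] += 1*x[0:n] and y[n:2n] += 2*x[n:2n] in one batched call.
const size_t n = 512, batch_count = 2;
const float alphas[] = {1.0f, 2.0f};
const size_t x_offsets[] = {0, n};
const size_t y_offsets[] = {0, n};
cl_event event = nullptr;
auto status = clblast::AxpyBatched<float>(n, alphas, x, x_offsets, 1,
                                          y, y_offsets, 1, batch_count,
                                          &queue, &event);
```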
xGEMMBATCHED: Batched version of GEMM
-------------

As GEMM, but multiple operations are batched together for better performance.

C++ API:
```
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const T *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const float *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const double *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_float2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_double2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_half *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
```
Arguments to GEMMBATCHED:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. * `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. * `const size_t m`: Integer size argument. This value must be positive. * `const size_t n`: Integer size argument. This value must be positive. * `const size_t k`: Integer size argument. This value must be positive. * `const T *alphas`: Input scalar constants. * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. * `const size_t *a_offsets`: The offsets in elements from the start of the input A matrix. * `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0. * `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix. * `const size_t *b_offsets`: The offsets in elements from the start of the input B matrix. * `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0. * `const T *betas`: Input scalar constants. * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. * `const size_t *c_offsets`: The offsets in elements from the start of the output C matrix. * `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0. * `const size_t batch_count`: Number of batches. This value must be positive. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. Requirements for GEMMBATCHED: * When `(transpose_a == Transpose::kNo && layout == Layout::kColMajor) || (transpose_a == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`. * When `(transpose_b == Transpose::kNo && layout == Layout::kColMajor) || (transpose_b == Transpose::kYes && layout == Layout::kRowMajor)`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`. * The value of `c_ld` must be at least `m`. xGEMMSTRIDEDBATCHED: StridedBatched version of GEMM ------------- As GEMM, but multiple strided operations are batched together for better performance. 
C++ API:
```
template <typename T>
StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event)
```

C API:
```
CLBlastStatusCode CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event)
```

Arguments to GEMMSTRIDEDBATCHED:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major
data-layout. * `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. * `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. * `const size_t m`: Integer size argument. This value must be positive. * `const size_t n`: Integer size argument. This value must be positive. * `const size_t k`: Integer size argument. This value must be positive. * `const T alpha`: Input scalar constant. * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. * `const size_t a_offset`: The offset in elements from the start of the input A matrix. * `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0. * `const size_t a_stride`: The (fixed) stride between two batches of the A matrix. * `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix. * `const size_t b_offset`: The offset in elements from the start of the input B matrix. * `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0. * `const size_t b_stride`: The (fixed) stride between two batches of the B matrix. * `const T beta`: Input scalar constant. * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. * `const size_t c_offset`: The offset in elements from the start of the output C matrix. * `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0. * `const size_t c_stride`: The (fixed) stride between two batches of the C matrix. * `const size_t batch_count`: Number of batches. This value must be positive. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. Requirements for GEMMSTRIDEDBATCHED: * When `(transpose_a == Transpose::kNo && layout == Layout::kColMajor) || (transpose_a == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`. * When `(transpose_b == Transpose::kNo && layout == Layout::kColMajor) || (transpose_b == Transpose::kYes && layout == Layout::kRowMajor)`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`. * The value of `c_ld` must be at least `m`. GemmTempBufferSize: Retrieves the size of the temporary buffer for GEMM (auxiliary function) ------------- Retrieves the required size of the temporary buffer for the GEMM kernel for specific arguments and for a specific device/platform and tuning parameters. This could be 0 in case no temporary buffer is required. Arguments are similar to those for GEMM. 
C++ API:
```
template <typename T>
StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t& temp_buffer_size)
```

C API:
```
CLBlastStatusCode CLBlastSGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size)
CLBlastStatusCode CLBlastDGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size)
CLBlastStatusCode CLBlastCGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size)
CLBlastStatusCode CLBlastZGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size)
CLBlastStatusCode CLBlastHGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size)
```

Arguments to GemmTempBufferSize:

* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `size_t& temp_buffer_size`: The result of this function: the required buffer size.
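A hedged sketch of querying the size and allocating the buffer once up-front (`context` and `queue` assumed valid; `m`, `n`, `k` assumed set; whether the reported size is in bytes or in elements is an assumption to verify against `clblast.h` for your version):

```
// Sketch: query the GEMM temporary-buffer size and pre-allocate it.
size_t temp_size = 0;
auto status = clblast::GemmTempBufferSize<float>(clblast::Layout::kColMajor,
                                                 clblast::Transpose::kNo, clblast::Transpose::kNo,
                                                 m, n, k,
                                                 0, m,  // a_offset, a_ld
                                                 0, k,  // b_offset, b_ld
                                                 0, m,  // c_offset, c_ld
                                                 &queue, temp_size);
cl_mem temp_buffer = nullptr;
if (status == clblast::StatusCode::kSuccess && temp_size > 0) {
  temp_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, temp_size, nullptr, nullptr);
}
// In recent CLBlast versions the buffer can then be passed to the optional
// temp-buffer argument of the GEMM routine, avoiding repeated allocations.
```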
ClearCache: Resets the cache of compiled binaries (auxiliary function)
-------------

CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache can be cleared to free up system memory or it can be useful in case of debugging.

C++ API:
```
StatusCode ClearCache()
```

C API:
```
CLBlastStatusCode CLBlastClearCache()
```

FillCache: Populates the cache of compiled binaries for a specific device (auxiliary function)
-------------

CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache is automatically populated whenever a new binary is created. Thus, the first run of a specific kernel could take extra time. For debugging or performance evaluation purposes, it might be useful to populate the cache upfront. This function populates the cache for all kernels in CLBlast for all precisions, but for a specific device only.

C++ API:
```
StatusCode FillCache(const cl_device_id device)
```

C API:
```
CLBlastStatusCode CLBlastFillCache(const cl_device_id device)
```

Arguments to FillCache:

* `const cl_device_id device`: The OpenCL device to fill the cache for.

RetrieveParameters: Retrieves current tuning parameters (auxiliary function)
-------------

This function retrieves current tuning parameters for a specific device-precision-kernel combination. This can be used for debugging or inspection. See [tuning.md](tuning.md) for more details on which kernel names and parameters are valid.

C++ API:
```
StatusCode RetrieveParameters(const cl_device_id device, const std::string &kernel_name, const Precision precision, std::unordered_map<std::string,size_t> &parameters)
```

A C API is not available for this function.

Arguments to RetrieveParameters (C++ version):

* `const cl_device_id device`: The OpenCL device to query the parameters for.
* `const std::string &kernel_name`: The target kernel name. This has to be one of the existing CLBlast kernels (Xaxpy, Xdot, Xgemv, XgemvFast, XgemvFastRot, Xgemv, Xger, Copy, Pad, Transpose, Padtranspose, Xgemm, or XgemmDirect). If this argument is incorrect, this function will return with the `clblast::kInvalidOverrideKernel` status-code.
* `const Precision precision`: The CLBlast precision enum to query the parameters for.
* `std::unordered_map<std::string,size_t> &parameters`: An unordered map of strings to integers. This will be filled with the current tuning parameters for a specific kernel.
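For illustration, a minimal sketch of inspecting the current Xgemm parameters (`device` is assumed to be a valid `cl_device_id`; the snippet additionally assumes `<cstdio>`, `<string>` and `<unordered_map>` are included):

```
// Sketch: print the current tuning parameters of the Xgemm kernel.
std::unordered_map<std::string, size_t> parameters;
auto status = clblast::RetrieveParameters(device, "Xgemm", clblast::Precision::kSingle, parameters);
if (status == clblast::StatusCode::kSuccess) {
  for (const auto &parameter : parameters) {
    printf("%s = %zu\n", parameter.first.c_str(), parameter.second);
  }
}
```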
OverrideParameters: Override tuning parameters (auxiliary function)
-------------

This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called it will be re-compiled and use the new parameters. All further times (until `OverrideParameters` is called again) it will load the kernel from the cache and thus continue to use the new parameters. Note that the first time after calling `OverrideParameters` a performance drop can be observed due to the re-compilation of the kernel. See [tuning.md](tuning.md) for more details on which kernel names and parameters are valid.

C++ API:
```
StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name, const Precision precision, const std::unordered_map<std::string,size_t> &parameters)
```

C API:
```
CLBlastStatusCode CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, const CLBlastPrecision precision, const size_t num_parameters, const char** parameters_names, const size_t* parameters_values)
```

Arguments to OverrideParameters (C++ version):

* `const cl_device_id device`: The OpenCL device to set the new parameters for.
* `const std::string &kernel_name`: The target kernel name. This has to be one of the existing CLBlast kernels (Xaxpy, Xdot, Xgemv, XgemvFast, XgemvFastRot, Xgemv, Xger, Copy, Pad, Transpose, Padtranspose, Xgemm, or XgemmDirect). If this argument is incorrect, this function will return with the `clblast::kInvalidOverrideKernel` status-code.
* `const Precision precision`: The CLBlast precision enum to set the new parameters for.
* `const std::unordered_map<std::string,size_t> &parameters`: An unordered map of strings to integers. This has to contain all the tuning parameters for a specific kernel as reported by the included tuners (e.g. `{ {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} }` for the `Copy` kernel). If this argument is incorrect, this function will return with the `clblast::kMissingOverrideParameter` status-code.
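Putting the map example from the argument list above into a full call, a minimal sketch (`device` assumed to be a valid `cl_device_id`):

```
// Sketch: override the Copy-kernel parameters with the values from the
// example above, for single precision.
const std::unordered_map<std::string, size_t> parameters = {
  {"COPY_DIMX", 8}, {"COPY_DIMY", 32}, {"COPY_VW", 4}, {"COPY_WPT", 8}
};
auto status = clblast::OverrideParameters(device, "Copy", clblast::Precision::kSingle, parameters);
```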
Tune: Run the tuner for a particular kernel (advanced usage)
-------------

The CLBlast kernels can be tuned using the tuning binaries, but also programmatically through an API. This is only recommended for advanced usage; see [the tuning docs](tuning.md) for more information.

C++ API:
```
// Tunes the "Xaxpy" kernel, used for many level-1 routines such as XAXPY, XCOPY, and XSWAP
template <typename T>
StatusCode PUBLIC_API TuneXaxpy(cl_command_queue* queue, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Xdot" kernel, used for level-1 reduction routines such as XDOT, XMAX, and XSUM
template <typename T>
StatusCode PUBLIC_API TuneXdot(cl_command_queue* queue, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Xgemv" kernel, used for matrix-vector level-2 routines such as XGEMV, XGBMV, and XHEMV
template <typename T>
StatusCode PUBLIC_API TuneXgemv(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Xger" kernel, used for matrix update level-2 routines such as XGER, XHER, and XSYR2
template <typename T>
StatusCode PUBLIC_API TuneXger(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Xgemm" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode PUBLIC_API TuneXgemm(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "XgemmDirect" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode PUBLIC_API TuneXgemmDirect(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Copy" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode PUBLIC_API TuneCopy(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Pad" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode PUBLIC_API TunePad(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Transpose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode PUBLIC_API TuneTranspose(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Padtranspose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode PUBLIC_API TunePadtranspose(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Invert" kernel, used for the level-3 routine XTRSM
template <typename T>
StatusCode PUBLIC_API TuneInvert(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map<std::string,size_t> &parameters);
```

Arguments to Tune (C++ version):

* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to tune the kernel for.
* `const size_t m`: The routine argument `m` to tune for (not applicable for all kernels).
* `const size_t n`: The routine argument `n` to tune for.
* `const size_t k`: The routine argument `k` to tune for (not applicable for all kernels).
* `const double fraction`: A value between 0.0 and 1.0 which determines the fraction of the tuning search space to explore.
* `std::unordered_map<std::string,size_t> &parameters`: An unordered map of strings to integers. This will return the best found tuning parameters.

CLBlast: Performance measuring and benchmarking
================

This document describes how to measure the performance of CLBlast and how to compare it against other libraries. For other information about CLBlast, see the [main README](../README.md).

Compiling the performance tests ('clients')
-------------

To test the performance of CLBlast and to compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS), cuBLAS (if testing on an NVIDIA GPU and `-DCUBLAS=ON` is set), or a CPU BLAS library (if installed), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:

    cmake -DCLIENTS=ON ..

The performance tests come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a number of configuration options and directly run CLBlast in a head-to-head performance test, optionally against clBLAS and/or a CPU BLAS library. You can use the command-line options `-clblas 1`, `-cblas 1`, or `-cublas 1` to select a library to test against.

Benchmarking
-------------

On [the CLBlast website](https://cnugteren.github.io/clblast) you will find performance results for various devices. Performance is compared in this case against a tuned version of the clBLAS library and optionally also against cuBLAS. Such graphs can be generated automatically on your own device as well. First, compile CLBlast with the clients enabled (see above). Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable (shipped with clBLAS). Finally, run the Python/Matplotlib graph-script found in `scripts/benchmark/benchmark.py`.
For example, to generate the SGEMM PDF on device 1 of platform 0 from the `build` subdirectory:

    python ../scripts/benchmark/benchmark.py --platform 0 --device 1 --benchmark gemm

Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See the [tuning README](tuning.md) to find out how to tune for your device. In case performance is still sub-optimal or something else is wrong, CLBlast can be built in verbose mode for (performance) debugging by specifying `-DVERBOSE=ON` to CMake.

CLBlast: Bindings / wrappers for other languages
================

The main APIs of CLBlast are C and C++ for OpenCL or CUDA. This document describes other APIs for other languages through bindings and wrappers. For other information about CLBlast, see the [main README](../README.md).

Plain C: Netlib BLAS API
-------------

CLBlast provides a Netlib CBLAS C API. This is however not recommended for performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be severely impacted. However, it can be useful if you don't want to touch OpenCL at all. Providing the `-DNETLIB=ON` flag to CMake at CLBlast compilation time will compile the Netlib API. Then, it can be used by including the corresponding header:

    #include <clblast_netlib_c.h>

The OpenCL device and platform can be set by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables.

Python: PyCLBlast
-------------

PyCLBlast provides Python bindings for CLBlast. It is integrated in the main CLBlast project and can be installed through `pip`. Details can be found in the [PyCLBlast README](https://github.com/CNugteren/CLBlast/tree/master/src/pyclblast) or on [PyPi](https://pypi.python.org/pypi/pyclblast).

Java: JOCLBlast (3rd party)
-------------

JOCLBlast is a 3rd party project providing bindings for Java. It is built on top of JOCL. Details can be found on the [JOCLBlast Github project page](https://github.com/gpu/JOCLBlast).

Nim: nim-CLBlast (3rd party)
-------------

A 3rd party CLBlast wrapper for the nim language is available [here](https://github.com/numforge/nim-clblast).

Julia: CLBlast.jl (3rd party)
-------------

A 3rd party CLBlast wrapper for [Julia](https://julialang.org/) is available [here](https://github.com/JuliaGPU/CLBlast.jl).

CLBlast: Details on the CONVGEMM routine
================

This document gives a bit more detail on how the CONVGEMM routine is organised and implemented. For other information about CLBlast, see the [main README](../README.md).

CONVGEMM: Two approaches
-------------

CLBlast implements two approaches to batched convolutions using GEMM: through im2col, or stand-alone:

* `ConvGemmMethod::kWithIm2Col`: running first a batched version of im2col to prepare the data into a temporary buffer, and then running a batched version of GEMM. The implementation is just as the regular im2col and GEMM kernels in CLBlast, but it is implemented as a separate kernel so all the non-needed features can be stripped out and some optimizations can be made. It uses the tuning parameters of the regular im2col and GEMM kernels.
* `ConvGemmMethod::kSingleKernel`: this is a single kernel approach: it loads the data in such a way that the im2col kernel is no longer needed, i.e. loading the data as the im2col transformation does it. That way it becomes a single kernel and there will be no need for an intermediate large buffer. It uses a separate set of tuning parameters, and can be tuned using the `clblast_tuner_xconvgemm` binary.

CONVGEMM: Selecting which approach to use
-------------

Since CONVGEMM is a relatively new and experimental feature, selection of the approach is hard-coded in [xconvgemm.hpp on line 32](../src/routines/levelx/xconvgemm.hpp:32), but can be changed there in a single place. The main drawback of the `ConvGemmMethod::kWithIm2Col` approach is its extra memory usage, but depending on the device and setting, it might be faster compared to the `ConvGemmMethod::kSingleKernel` approach. The latter has as extra advantage that it has its own tuning parameters, so it can be fine-tuned for your specific use-case a bit better than the 2-kernel approach with im2col.

CLBlast: Details on the GEMM routine and kernel
================

This document gives a bit more detail on how the GEMM routine is organised and implemented. For other information about CLBlast, see the [main README](../README.md).

GEMM: Two approaches
-------------

CLBlast implements two approaches to GEMM: direct and indirect:

* Direct GEMM: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
* Indirect GEMM: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.

GEMM: In-direct approach
-------------

Similar to the work by Matsumoto et al. ("Performance Tuning of Matrix Multiplication in OpenCL on Different GPUs and CPUs"), the main GEMM kernel makes many assumptions on the input arguments, which are handled by pre-processing and post-processing kernels. These assumptions are e.g. matrix sizes are a multiple of the work-group sizes, offsets are zero, and matrix B is transposed. This is a good solution for larger problem sizes since O(n^2) data movement is typically cheaper than O(n^3) computation, but the hidden constant starts to play a role for smaller n. Therefore, there is also a single-kernel direct version available for those cases, but it shares most of the design and parameters as discussed below.

The main kernel has 14 different parameters, of which some are illustrated in figure 1 in the [CLBlast paper](https://arxiv.org/pdf/1705.05249). The parameters define among others the work-group sizes in 2 dimensions (MWG, NWG), the 2D register tiling configuration (MWI, NWI), the vector widths of both input matrices (VWM, VWN), loop unroll factors (KWI), and whether or not and how to use the local memory.

GEMM: Direct approach
-------------

This is a single-kernel approach that shares many of the parameters with the in-direct kernel.
One of the differences is that within the kernel there are checks for incomplete tiles in the m/n/k dimensions, influenced by the tuning parameters and the matrix sizes. These incomplete tiles will run a different part of the code, as they for example cannot benefit from vectorisation. Another difference is that there are dedicated kernels for each a/b transpose requirement: NN, NT, TN, TT for non-transposed and transposed.

CLBlast: FAQ
================

This document answers some frequently asked questions. For other information about CLBlast, see the [main README](../README.md).

What is the difference between the direct and indirect GEMM kernel?
-------------

There are two ways to perform GEMM implemented in CLBlast:

* __Direct GEMM__: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
* __Indirect GEMM__: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.

The GEMM routine tuner will find out from which m/n/k sizes onwards the indirect approach is favorable over the direct approach. Typically the direct approach is faster for small matrices.

What is the difference between the GEMMK=0 and GEMMK=1 kernels?
-------------

For the indirect GEMM kernel (see above) there are basically two implementations: an older approach (GEMMK=0) and a newer kernel with 2D register tiling and support for shuffling (GEMMK=1). On most devices the old approach is still the fastest, but some devices can benefit more from the other kernel. The regular GEMM kernel tuner will explore both kernels, making sure to select the fastest one.

The GEMM tuner runs in 4 stages, what are they?
-------------

The regular GEMM tuner tunes the indirect kernel (see above), tuning for the GEMMK=0 kernel first (stage 1/4 and 2/4) followed by the GEMMK=1 variant (stage 3/4 and 4/4). In both cases, first a fixed set of likely-to-be-good parameters is explored fully (1/4 and 3/4), followed by a random selection of parameters in a much larger search space (2/4 and 4/4). In the end the library will only care about the final best kernel configuration among all 4 stages. The direct GEMM tuner runs in 2 stages: as above, it first explores a small set of parameters exhaustively, followed by a random selection of a larger search space.

The GEMM routine uses too much memory or results in error -4, what can I do?
-------------

By design the indirect version of the GEMM kernel might allocate some temporary memory on your device, and that might be an issue in some scenarios. However, there are a few things you could do to avoid this:

* Use the [override parameters](../include/clblast.h#L717) functionality to set the switching point between direct and in-direct kernels much further. Example [here in one of the tests](../test/routines/level3/xgemm.hpp#L73). This might affect the performance of the GEMM routine.
* [Query the required buffer size](../include/clblast.h#L691), allocate the buffer yourself, and pass that to [GEMM](../include/clblast.h#L525).
  That way you are in control and can, for example, make sure it is only allocated once.
* Make sure no temporary buffer is required. Thus, make sure the buffer size is already a multiple of the amount of work done per work-group, e.g. 32, 64 or 128 at most depending on the tuned values for your device (you can query them if wanted). Then also make sure they are pre-transposed as needed. The [query-temp-buffer-size function](../include/clblast.h#L691) and its implementation can help you figure out if you are there yet.

The tuners occasionally report failures or errors, is this an issue?
-------------

The tuners explore many different kernel parameters, sometimes quite extreme, seeking the bounds of the hardware or resulting in very large binaries. Depending on your device and OpenCL implementation, it might well be that failures occur. However, the tuner will automatically detect incorrect results or failed kernels, and will skip them. Only if the number of failures is very large might something be wrong in the CLBlast code. In that case, it can be reported as an issue.

CLBlast: Glossary
================

This document describes some commonly used terms in CLBlast documentation and code. For other information about CLBlast, see the [main README](../README.md).

* __BLAS__: The set of 'Basic Linear Algebra Subroutines'.
* __Netlib BLAS__: The official BLAS API definition, with __CBLAS__ providing the C headers.
* __OpenCL__: The open compute language, a Khronos standard for heterogeneous and parallel computing, e.g. on GPUs.
* __kernel__: An OpenCL parallel program that runs on the target device.
* __clBLAS__: Another OpenCL BLAS library, maintained by AMD.
* __cuBLAS__: The main CUDA BLAS library, maintained by NVIDIA.
* __GEMM__: The 'GEneral Matrix Multiplication' routine.
* __Direct GEMM__: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
* __Indirect GEMM__: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.

CLBlast: Building and installing
================

This document describes how to compile, link, and install CLBlast on various platforms. You can either use a pre-built package or compile the library from source. For other information about CLBlast, see the [main README](../README.md).

Requirements
-------------

The pre-requisites for compilation of CLBlast are kept as minimal as possible. A basic compilation infrastructure is all you need, no external dependencies are required.
You'll need:

* CMake version 2.8.10 or higher
* A C++11 compiler, for example:
  - GCC 4.7.0 or newer
  - Clang 3.3 or newer
  - AppleClang 5.0 or newer
  - ICC 14.0 or newer
  - MSVC (Visual Studio) 2013 or newer
* An OpenCL 1.1 or newer library, for example:
  - Apple OpenCL
  - NVIDIA CUDA SDK
  - AMD APP SDK
  - Intel OpenCL
  - Beignet
  - Mesa Clover
  - ARM Mali OpenCL
  - Vivante OpenCL
  - POCL

Using pre-built packages
-------------

There are pre-built binaries available for Ubuntu, Debian, macOS, and Windows. CLBlast is in Ubuntu 21.04 (Hirsute Hippo) and Debian 11 (Bullseye) and later, and can be installed with:

    sudo apt install libclblast-dev

This may not be the latest version of CLBlast. The latest should be available in [Debian unstable](https://tracker.debian.org/pkg/clblast), or can be built from source as described below. Users of older versions of Ubuntu can use [this PPA](https://launchpad.net/~cnugteren/+archive/ubuntu/clblast).

For Arch Linux and Manjaro, CLBlast is available as a [package](https://aur.archlinux.org/packages/clblast-git) maintained by a 3rd party.

For OS X / macOS, CLBlast is available through [Homebrew](https://github.com/Homebrew/homebrew-core/blob/master/Formula/clblast.rb). It can be installed as follows:

    brew update
    brew install clblast

For Windows, binaries are provided in a .zip file on GitHub as part of the [CLBlast release page](https://github.com/CNugteren/CLBlast/releases).

Linux / macOS compilation from source
-------------

Configuration can be done using CMake. On Linux and macOS systems with make, building is straightforward. Here's an example of an out-of-source build using a command-line compiler and make (starting from the root of the CLBlast folder):

    mkdir build
    cd build
    cmake ..
    make
    sudo make install # (optional)

A custom installation folder can be specified when calling CMake:

    cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory ..

Building a static version of the library instead of a shared one (.dylib/.so) can be done by disabling the `BUILD_SHARED_LIBS` option when calling CMake. For example:

    cmake -DBUILD_SHARED_LIBS=OFF ..

In case you run into segfaults with OpenCL programs (known to happen with the AMD APP SDK), you can try the following (thanks to [kpot](https://github.com/CNugteren/CLBlast/issues/243#issuecomment-367277297)):

1. Use `-fPIC` or its analogue when compiling. In CMake you can do this by adding `set(CMAKE_POSITION_INDEPENDENT_CODE ON)` to the project config.
2. Forbid CMake to add RPATH entries to binaries. You can do this project-wise with `set(CMAKE_SKIP_BUILD_RPATH ON)` in CMake.
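To verify that the freshly installed library links and runs, a small stand-alone program can help. Below is a minimal sketch (not part of the CLBlast sources) that runs a single-precision AXPY on the first device of the first OpenCL platform; on a typical Linux install it compiles with something like `g++ smoke_test.cpp -lclblast -lOpenCL`:

```
#include <cstdio>
#include <vector>
#include <CL/cl.h>
#include <clblast.h>

int main() {
  // Boilerplate: first platform, first device, a context and a queue.
  cl_platform_id platform; clGetPlatformIDs(1, &platform, nullptr);
  cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);
  cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);

  // Compute y = 2*x + y for a small vector.
  const size_t n = 4;
  std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
  std::vector<float> y = {0.0f, 0.0f, 0.0f, 0.0f};
  cl_mem x_buf = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                n * sizeof(float), x.data(), nullptr);
  cl_mem y_buf = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                n * sizeof(float), y.data(), nullptr);
  const auto status = clblast::Axpy<float>(n, 2.0f, x_buf, 0, 1, y_buf, 0, 1, &queue);

  // Read back and check: y[0] should now be 2.0.
  clEnqueueReadBuffer(queue, y_buf, CL_TRUE, 0, n * sizeof(float), y.data(), 0, nullptr, nullptr);
  std::printf("status = %d, y[0] = %.1f\n", static_cast<int>(status), y[0]);

  clReleaseMemObject(x_buf); clReleaseMemObject(y_buf);
  clReleaseCommandQueue(queue); clReleaseContext(context);
  return status == clblast::StatusCode::kSuccess ? 0 : 1;
}
```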
Windows compilation from source
-------------

When using Visual Studio 2015, the project files can be generated as follows:

    mkdir build
    cd build
    cmake -G "Visual Studio 14 Win64" ..

For another version, replace 14 with the appropriate version (12 for VS 2013, 15 for VS 2017). To generate a static version of the library instead of a .dll, specify `-DBUILD_SHARED_LIBS=OFF` when running cmake.

Android compilation from source
-------------

For deployment on Android, there are three options to consider.

First of all, you can use Google's recommended route of installing Android Studio with the NDK, and then use the JNI to interface to the CLBlast library. For this, we refer to the official Android Studio documentation and the online tutorials.

Alternatively, you can cross-compile the library and the test/client/tuner executables directly. To do so, first install the NDK, then find your vendor's OpenCL library (e.g. in `/system/vendor/lib`), get OpenCL headers from the Khronos registry, and invoke CMake as follows:

    cmake .. \
     -DCMAKE_SYSTEM_NAME=Android \
     -DCMAKE_SYSTEM_VERSION=19 \             # Set the appropriate Android API level
     -DCMAKE_ANDROID_ARCH_ABI=armeabi-v7a \  # Set the appropriate device architecture (e.g. armeabi-v7a or arm64-v8a)
     -DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \ # Assumes $ANDROID_NDK_PATH points to your NDK installation
     -DCMAKE_ANDROID_STL_TYPE=gnustl_static \
     -DOPENCL_ROOT=/path/to/vendor/OpenCL/lib/folder/ # Should contain libOpenCL.so and CL/cl.h

For any potential issues, first check [cmath 'has not been declared' errors](https://stackoverflow.com/questions/45183525/compilation-error-with-ndk-using-cstatic/46433625). Also, if you are encountering errors such as `#error Bionic header ctype.h does not define either _U nor _CTYPE_U`, make sure CMake is not including system paths.

Finally, a third option is to use the [Collective Knowledge framework](https://github.com/ctuning/ck) in combination with the NDK, e.g. as follows:

    sudo pip install ck
    ck pull repo:ck-math
    ck install package:lib-clblast-master-universal --target_os=android21-arm64

Compiling CLBlast with a CUDA back-end
-------------

There is also a CUDA API of CLBlast available. Enabling this compiles the whole library for CUDA and thus replaces the OpenCL API. It is based upon the CUDA runtime and NVRTC APIs, requiring NVIDIA CUDA 7.5 or higher. The CUDA version of the library can be used as follows after providing the `-DCUDA=ON -DOPENCL=OFF` flags to CMake:

    #include <clblast_cuda.h>

CLBlast-1.6.3/doc/routines.md000066400000000000000000000137071463263031500157530ustar00rootroot00000000000000CLBlast: Supported routines overview
================

This document describes which routines are supported in CLBlast. For other information about CLBlast, see the [main README](../README.md). Full API documentation is available in a separate [API documentation file](api.md).

Supported types
-------------

The different data-types supported by the library are:

* __S:__ Single-precision 32-bit floating-point (`float`).
* __D:__ Double-precision 64-bit floating-point (`double`).
* __C:__ Complex single-precision 2x32-bit floating-point (`std::complex<float>`).
* __Z:__ Complex double-precision 2x64-bit floating-point (`std::complex<double>`).
* __H:__ Half-precision 16-bit floating-point (`cl_half`). See section 'Half precision' below for more information.

Supported routines
-------------

CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-BLAS routines. The supported BLAS routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all.
| Level-1 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
| xSWAP    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xSCAL    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xCOPY    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xAXPY    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xDOT     | ✔ | ✔ | - | - | ✔ |
| xDOTU    | - | - | ✔ | ✔ | - |
| xDOTC    | - | - | ✔ | ✔ | - |
| xNRM2    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xASUM    | ✔ | ✔ | ✔ | ✔ | ✔ |
| IxAMAX   | ✔ | ✔ | ✔ | ✔ | ✔ |

| Level-2 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
| xGEMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xGBMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHEMV    | - | - | ✔ | ✔ | - |
| xHBMV    | - | - | ✔ | ✔ | - |
| xHPMV    | - | - | ✔ | ✔ | - |
| xSYMV    | ✔ | ✔ | - | - | ✔ |
| xSBMV    | ✔ | ✔ | - | - | ✔ |
| xSPMV    | ✔ | ✔ | - | - | ✔ |
| xTRMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTBMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTPMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xGER     | ✔ | ✔ | - | - | ✔ |
| xGERU    | - | - | ✔ | ✔ | - |
| xGERC    | - | - | ✔ | ✔ | - |
| xHER     | - | - | ✔ | ✔ | - |
| xHPR     | - | - | ✔ | ✔ | - |
| xHER2    | - | - | ✔ | ✔ | - |
| xHPR2    | - | - | ✔ | ✔ | - |
| xSYR     | ✔ | ✔ | - | - | ✔ |
| xSPR     | ✔ | ✔ | - | - | ✔ |
| xSYR2    | ✔ | ✔ | - | - | ✔ |
| xSPR2    | ✔ | ✔ | - | - | ✔ |
| xTRSV    | ✔ | ✔ | ✔ | ✔ |   |

| Level-3 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
| xGEMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xSYMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHEMM    | - | - | ✔ | ✔ | - |
| xSYRK    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHERK    | - | - | ✔ | ✔ | - |
| xSYR2K   | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHER2K   | - | - | ✔ | ✔ | - |
| xTRMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTRSM    | ✔ | ✔ | ✔ | ✔ |   |

Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance:

| Batched             | S | D | C | Z | H |
| --------------------|---|---|---|---|---|
| xAXPYBATCHED        | ✔ | ✔ | ✔ | ✔ | ✔ |
| xGEMMBATCHED        | ✔ | ✔ | ✔ | ✔ | ✔ |
| xGEMMSTRIDEDBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ |

In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:

| Level-X    | S | D | C | Z | H |
| -----------|---|---|---|---|---|
| xSUM       | ✔ | ✔ | ✔ | ✔ | ✔ | (Similar to xASUM, but not absolute)
| IxAMIN     | ✔ | ✔ | ✔ | ✔ | ✔ | (Similar to IxAMAX, but minimum instead of maximum)
| IxMAX      | ✔ | ✔ | ✔ | ✔ | ✔ | (Similar to IxAMAX, but not absolute)
| IxMIN      | ✔ | ✔ | ✔ | ✔ | ✔ | (Similar to IxAMAX, but not absolute and minimum instead of maximum)
| xHAD       | ✔ | ✔ | ✔ | ✔ | ✔ | (Hadamard product)
| xOMATCOPY  | ✔ | ✔ | ✔ | ✔ | ✔ | (Out-of-place copying/transposing/scaling of matrices)
| xIM2COL    | ✔ | ✔ | ✔ | ✔ | ✔ | (Image to column transform as used to express convolution as GEMM)
| xCOL2IM    | ✔ | ✔ | ✔ | ✔ | ✔ | (Column to image transform as used in machine learning)
| xCONVGEMM  | ✔ | ✔ | - | - | ✔ | (Experimental, implemented as either im2col followed by batched GEMM or as a single kernel)

Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.

Half precision (fp16)
-------------

The half-precision fp16 format is a 16-bits floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this include, among others, Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is of particular interest for the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.

Since there is no half-precision data-type in C or C++, OpenCL provides the `cl_half` type for the host device. Unfortunately, internally this translates to a 16-bits integer, so computations on the host using this data-type should be avoided. For convenience, CLBlast provides the `clblast_half.h` header (C99 and C++ compatible), defining the `half` type as a short-hand to `cl_half` and the following basic functions:

* `half FloatToHalf(const float value)`: Converts a 32-bits floating-point value to a 16-bits floating-point value.
* `float HalfToFloat(const half value)`: Converts a 16-bits floating-point value to a 32-bits floating-point value.

The [samples/haxpy.c](../samples/haxpy.c) example shows how to use these convenience functions when calling the half-precision BLAS routine HAXPY.
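As a small illustration of these helpers, the sketch below (ours, not one of the shipped samples) round-trips a value through fp16 on the host; note that most values, like 0.1f here, are not exactly representable in half precision:

```
#include <cstdio>
#include <clblast_half.h>

int main() {
  const half x = FloatToHalf(0.1f);   // host-side conversion to fp16 storage
  const float back = HalfToFloat(x);  // and back to fp32 for host arithmetic
  std::printf("0.1f -> fp16 -> fp32: %f\n", back); // ~0.099976 due to rounding
  return 0;
}
```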
CLBlast-1.6.3/doc/testing.md000066400000000000000000000034631463263031500155620ustar00rootroot00000000000000CLBlast: Testing the library for correctness
================

This document describes how to test the library. For other information about CLBlast, see the [main README](../README.md).

Compiling the correctness tests
-------------

To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled by specifying `-DTESTS=ON`, for example as follows:

    cmake -DTESTS=ON ..

To build these tests, another BLAS library is needed to serve as a reference. This can be either:

* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
* A regular CPU Netlib BLAS library, e.g.:
  - OpenBLAS
  - BLIS
  - Accelerate

Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`).

Running the tests
-------------

All tests can be run as individual executables or directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. Further options can be supplied through the `CLBLAST_ARGUMENTS` environmental variable (e.g. `export CLBLAST_ARGUMENTS="-full_test -cblas 1 -clblas 0"` on a UNIX system).

Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
CLBlast-1.6.3/doc/tuning.md000066400000000000000000000364171463263031500154140ustar00rootroot00000000000000CLBlast: Tuning for better performance
================

This document describes how to tune CLBlast for better performance and lists for which devices tuned kernels are already available. For other information about CLBlast, see the [main README](../README.md).

Already tuned-for devices
-------------

The CLBlast library is already tuned for the most commonly used OpenCL devices and it's gradually being extended to other devices as well. For unseen devices CLBlast will make use of common-best tuning values for similar architectures (e.g.
AMD Fiji) or in general similar devices (e.g. AMD GPUs), so performance might still be decent. The current release of CLBlast is tuned for the following devices: * NVIDIA GPUs: - SM 2.0: - GeForce GTX 480 - GeForce GTX 580 - SM 3.0: - GRID K520 - GeForce GT 650M - GeForce GTX 670 - GeForce GTX 680 - GeForce GTX 760 Ti - SM 3.5: - GeForce GT 730 - GeForce 920A - GeForce GTX TITAN - GeForce GTX TITAN Black - Tesla K20m - Tesla K40m - SM 5.0: - GeForce GTX 920MX - GeForce GTX 750 - GeForce GTX 750 Ti - Quadro M2000M - SM 5.2: - GeForce GTX 970 - GeForce GTX 980 - GeForce GTX TITAN X - SM 6.0: - Tesla P100 16GB - SM 6.1: - GeForce MX 150 - GeForce GTX 1060 6GB - GeForce GTX 1070 - GeForce GTX 1070 Ti - GeForce GTX 1080 - GeForce GTX 1080 Ti - TITAN X (Pascal) - Tesla P4 - SM 7.0: - Quadro GV100 - Tesla V100 - SM 7.5: - GeForce MX 450 - GeForce GTX 1650 - GeForce GTX 1650 Ti - GeForce GTX 1650 Super - GeForce GTX 2060 - GeForce GTX 2070 with Max-Q - GeForce GTX 2070 Super - GeForce GTX 2080 with Max-Q - GeForce GTX 2080 Ti - Quadro T2000 - TITAN RTX - Tesla T4 - SM 8.0: - Tesla A100 40GB - SM 8.6: - GeForce GTX 3050 Ti Laptop - GeForce GTX 3060 Laptop - GeForce GTX 3070 - GeForce GTX 3070 Ti Laptop - GeForce GTX 3080 - GeForce GTX 3080 Laptop - GeForce GTX 3080 Ti - GeForce GTX 3090 - RTX A6000 - SM 8.9: - GeForce GTX 4050 Laptop - GeForce RTX 4060 - GeForce GTX 4060 Ti - GeForce GTX 4070 Laptop - GeForce GTX 4070 Ti - GeForce GTX 4080 - GeForce GTX 4090 * AMD GPUs: - Turks: - Radeon HD 6770M - Vancouver: - Radeon HD 6750M - Tahiti: - Radeon HD 7970 - Oland: - Radeon R7 250 - Pitcairn: - Radeon R9 270X - Hawaii: - FirePro W8100 - Radeon R9 290X - Tonga: - Radeon R9 380 - Fiji: - Radeon 500 - Radeon R9 Fury X - Radeon R9 M370X - Ellesmere: - Radeon RX 480 - Radeon RX 580 2048SP - Radeon RX 590 GME - Vega: - Radeon RX Vega - gfx902: - Radeon RX Vega - Radeon RX Vega 10 - gfx906: - Radeon VII - gfx90c: - Ryzen 5600G APU - Ryzen 5700G APU - gfx1010: - Radeon RX 5700 - Radeon RX 5700 XT - gfx1030: - Radeon RX 6800 XT - Radeon RX 6900 XT - gfx1031: - Radeon RX 6700 XT - gfx1032: - Radeon RX 6600 XT - gfx1034: - Radeon RX 6500 XT - gfx1035: - Radeon 680M - Ryzen 4600G APU - gfx1100: - Radeon RX 7900 XTX - gfx1101: - Radeon RX 7800 XT - gfx1102: - Radeon RX 7600 - gfx1103: - Radeon 780M - Other: - Radeon Pro 450 - Radeon Pro 580 * Intel GPUs: - HD Graphics 530 - HD Graphics 5500 BroadWell U-Processor GT2 - HD Graphics 6000 BroadWell U-Processor GT3 - HD Graphics Haswell Ultrabook GT2 Mobile - HD Graphics IvyBridge M GT2 - HD Graphics Skylake ULT GT2 - UHD Graphics 620 - UHD Graphics 630 - UHD Graphics 770 - Iris - Iris Pro - Iris Xe Graphics - RaptorLake-S Mobile Graphics - Arc A750 - Arc A770 * Intel CPUs: - Core i5-4570 - Core i5-4590S - Core i5-6200U - Core i7-920 - Core i7-2670QM - Core i7-3770K - Core i7-4790K - Core i7-5930K - Core i7-6770HQ - Core i7-12700H - Core i9-9980HK - Xeon E5-2630 v3 - Xeon E5-2630 v4 * Other devices: - ARM Mali-T628 GPU - ARM Mali-T760 GPU - ARM Mali-G57 MC2 GPU - Qualcomm Adreno 330 GPU - Qualcomm Adreno 540 GPU - Qualcomm Adreno 640 GPU - Qualcomm Adreno 730 GPU - Qualcomm Adreno 740 GPU - Intel MIC - Imagination PowerVR B-Series BXE-4-32 - Apple M1 GPU - Apple M2 Max GPU If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should run the included tuners. Compiling and running the tuners ------------- The included CLBlast tuners are compiled with the default CMake options. 
If they are not compiled, make sure you are specifying `-DTUNERS=ON`, for example as follows:

    cmake -DTUNERS=ON ..

Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance.

Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables.

Alternatively, you can also manually run each of the tuners for each of the precisions. Here's an example to tune the `axpy` kernels for 64-bit precision on device 0 of platform 0:

    ./clblast_tuner_xaxpy --precision 64 --device 0 --platform 0

The kernels `gemm` and `gemm_direct` have too many parameter combinations to explore exhaustively. Therefore, they will run in two stages: a first stage with a fixed, limited number of parameter combinations, and a second stage with a random selection from a much larger search space. The random fraction is determined by the `fraction` argument on the command-line.

There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the indirect GEMM kernel.

Here are all the tuners included in the `make alltuners` target (in the same order) with all their precision arguments:

    ./clblast_tuner_copy_fast -precision 32
    ./clblast_tuner_copy_fast -precision 64
    ./clblast_tuner_copy_fast -precision 3232
    ./clblast_tuner_copy_fast -precision 6464
    ./clblast_tuner_copy_fast -precision 16
    ./clblast_tuner_copy_pad -precision 32
    ./clblast_tuner_copy_pad -precision 64
    ./clblast_tuner_copy_pad -precision 3232
    ./clblast_tuner_copy_pad -precision 6464
    ./clblast_tuner_copy_pad -precision 16
    ./clblast_tuner_transpose_fast -precision 32
    ./clblast_tuner_transpose_fast -precision 64
    ./clblast_tuner_transpose_fast -precision 3232
    ./clblast_tuner_transpose_fast -precision 6464
    ./clblast_tuner_transpose_fast -precision 16
    ./clblast_tuner_transpose_pad -precision 32
    ./clblast_tuner_transpose_pad -precision 64
    ./clblast_tuner_transpose_pad -precision 3232
    ./clblast_tuner_transpose_pad -precision 6464
    ./clblast_tuner_transpose_pad -precision 16
    ./clblast_tuner_xaxpy -precision 32
    ./clblast_tuner_xaxpy -precision 64
    ./clblast_tuner_xaxpy -precision 3232
    ./clblast_tuner_xaxpy -precision 6464
    ./clblast_tuner_xaxpy -precision 16
    ./clblast_tuner_xdot -precision 32
    ./clblast_tuner_xdot -precision 64
    ./clblast_tuner_xdot -precision 3232
    ./clblast_tuner_xdot -precision 6464
    ./clblast_tuner_xdot -precision 16
    ./clblast_tuner_xger -precision 32
    ./clblast_tuner_xger -precision 64
    ./clblast_tuner_xger -precision 3232
    ./clblast_tuner_xger -precision 6464
    ./clblast_tuner_xger -precision 16
    ./clblast_tuner_xgemm -precision 32
    ./clblast_tuner_xgemm -precision 64
    ./clblast_tuner_xgemm -precision 3232
    ./clblast_tuner_xgemm -precision 6464
    ./clblast_tuner_xgemm -precision 16
    ./clblast_tuner_xgemm_direct -precision 32
    ./clblast_tuner_xgemm_direct -precision 64
    ./clblast_tuner_xgemm_direct -precision 3232
    ./clblast_tuner_xgemm_direct -precision 6464
    ./clblast_tuner_xgemm_direct -precision 16
    ./clblast_tuner_xgemv -precision 32
    ./clblast_tuner_xgemv -precision 64
    ./clblast_tuner_xgemv -precision 3232
    ./clblast_tuner_xgemv -precision 6464
    ./clblast_tuner_xgemv -precision 16
    ./clblast_tuner_invert -precision 32
    ./clblast_tuner_invert -precision 64
    ./clblast_tuner_invert -precision 3232
    ./clblast_tuner_invert -precision 6464
    ./clblast_tuner_invert -precision 16
    ./clblast_tuner_routine_xgemm -precision 32
    ./clblast_tuner_routine_xgemm -precision 64
    ./clblast_tuner_routine_xgemm -precision 3232
    ./clblast_tuner_routine_xgemm -precision 6464
    ./clblast_tuner_routine_xgemm -precision 16
    ./clblast_tuner_routine_xtrsv -precision 32
    ./clblast_tuner_routine_xtrsv -precision 64
    ./clblast_tuner_routine_xtrsv -precision 3232
    ./clblast_tuner_routine_xtrsv -precision 6464
    ./clblast_tuner_routine_xtrsv -precision 16

Using the tuning results
-------------

The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files [to the corresponding issue](https://github.com/CNugteren/CLBlast/issues/1) on GitHub or [email the main author](http://www.cedricnugteren.nl).

In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):

    mkdir build
    cd build
    cmake -DTUNERS=ON ..
    make
    make alltuners
    python ../scripts/database/database.py . ..
    make

Tuning using the API (advanced users only)
-------------

Apart from running the tuning binaries, it is also possible to run the tuners programmatically through the CLBlast API. This could be useful if you want to tune for non-standard arguments (e.g. a rectangular or very small matrix). The tuning results can then also be set programmatically using `OverrideParameters`. The tuning API does not perform any disk or stdout I/O, so it is not possible to track progress. Running the regular tuner binaries should give an idea of the number of configurations to explore for a particular device, thus giving an indication of a good value for the `fraction` argument (see the [API documentation](api.md) for more details).

Inspecting and changing tuning parameters at run-time
-------------

Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function, which will set new parameters for a specific kernel. At the next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on, until `OverrideParameters` is called again. This is the API:

    StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name,
                                             const Precision precision,
                                             const std::unordered_map<std::string,size_t> &parameters)
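For example, here is a minimal sketch (with made-up parameter values; real ones should come from your own tuning runs) that overrides the `Xaxpy` parameters from the table further below for single precision:

```
#include <string>
#include <unordered_map>
#include <clblast.h>

// Override the Xaxpy kernel parameters for a given device. The parameter
// names (VW, WGS, WPT) are listed in the parameter table below; the values
// here are only an illustration.
clblast::StatusCode SetCustomAxpyParameters(const cl_device_id device) {
  const std::unordered_map<std::string, size_t> parameters = {
      {"VW", 4}, {"WGS", 128}, {"WPT", 2}};
  return clblast::OverrideParameters(device, "Xaxpy", clblast::Precision::kSingle, parameters);
}
```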
To inspect current behaviour, you can also retrieve the parameters for a specific device and kernel combination:

    StatusCode PUBLIC_API RetrieveParameters(const cl_device_id device, const std::string &kernel_name,
                                             const Precision precision,
                                             std::unordered_map<std::string,size_t> &parameters)

These two functions require/retrieve the parameters as given in [src/database/kernels](../src/database/kernels), i.e.:

| Kernel name         | Parameters            |
| --------------------|-----------------------|
| Xaxpy               | VW, WGS, WPT          |
| Xdot                | WGS1, WGS2            |
| Xgemv               | WGS1, WPT1            |
| XgemvFast           | VW2, WGS2, WPT2       |
| XgemvFastRot        | VW3, WGS3, WPT3       |
| Xger                | WGS1, WGS2, WPT       |
| Xtrsv               | TRSV_BLOCK_SIZE       |
| Xgemm               | GEMMK, KREG, KWG, KWI, MDIMA, MDIMC, MWG, NDIMB, NDIMC, NWG, SA, SB, STRM, STRN, VWM, VWN |
| XgemmDirect         | KWID, MDIMAD, MDIMCD, NDIMBD, NDIMCD, PADA, PADB, VWMD, VWND, WGD |
| Copy                | COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT |
| Pad                 | PAD_DIMX, PAD_DIMY, PAD_WPTX, PAD_WPTY |
| Transpose           | TRA_DIM, TRA_PAD, TRA_SHUFFLE, TRA_WPT |
| Padtranspose        | PADTRA_PAD, PADTRA_TILE, PADTRA_WPT |
| Invert              | INTERNAL_BLOCK_SIZE   |
| TrsvRoutine         | TRSV_BLOCK_SIZE       |

Tuning OpenCL compiler options
-------------

For all of CLBlast's APIs, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler. Also make sure this is set in the same way when running the tuners.

Which kernels are used for which routines?
-------------

To find out which tuners to run for which routines, you can use the table below. The kernel names correspond to the tuner binaries, the tuner API, and to the arguments for `OverrideParameters` and `RetrieveParameters`.

| Routines                                                                  | Kernel(s) / Tuner(s)             |
| -------------------------------------------------------------------------|----------------------------------|
| AXPY COPY SCAL SWAP OMATCOPY AXPYBATCHED                                  | Xaxpy                            |
| AMAX ASUM DOT DOTC DOTU NRM2 SUM MAX MIN AMIN                             | Xdot                             |
| GBMV GEMV HBMV HEMV HPMV SBMV SPMV SYMV TBMV TPMV TRMV TRSV               | Xgemv                            |
| GER GERC GERU HER HER2 HPR HPR2 SPR SPR2 SYR SYR2                         | Xger                             |
| GEMM HEMM HER2K HERK SYMM SYR2K SYRK TRMM GEMMBATCHED GEMMSTRIDEDBATCHED  | Xgemm XgemmDirect Copy Pad Transpose Padtranspose |
| TRSM                                                                      | Xgemm XgemmDirect Copy Pad Transpose Padtranspose Invert |
| IM2COL COL2IM                                                             | Copy                             |

A note on clock frequencies for tuning
-------------

You should consider limiting the clock speeds of your processors before performing the tuning. Some examples are given below.

To set the CPU frequency on a Linux machine:

```
sudo cpupower frequency-set -g performance
sudo cpupower frequency-set -u 3100
```

To set the NVIDIA GPU frequency on a Linux machine (the placeholders stand for your GPU's index and the chosen frequency):

```
sudo nvidia-smi -i <device_id> -lgc <clock_speed>
```

You can get the possible frequencies for your NVIDIA GPU using the following command:

```
sudo nvidia-smi -i <device_id> --query-supported-clocks=gr --format=csv
```

The suggestion is to pick a clock speed that is stable, somewhere in the middle of the range of frequencies listed above.
CLBlast-1.6.3/include/000077500000000000000000000000001463263031500144335ustar00rootroot00000000000000CLBlast-1.6.3/include/clblast.h000066400000000000000000001240231463263031500162320ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0.
This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // // This file contains the interface to the CLBlast BLAS routines. It also contains the definitions // of the returned status codes and the layout and transpose types. This is the only header users // of CLBlast should include and use. // // ================================================================================================= #ifndef CLBLAST_CLBLAST_H_ #define CLBLAST_CLBLAST_H_ #include <cstdlib> // For size_t #include <string> // For OverrideParameters function #include <unordered_map> // For OverrideParameters function // Includes the normal OpenCL C header #if defined(__APPLE__) || defined(__MACOSX) #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif // Exports library functions under Windows when building a DLL. See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx #if defined(_WIN32) && defined(CLBLAST_DLL) #if defined(COMPILING_DLL) #define PUBLIC_API __declspec(dllexport) #else #define PUBLIC_API __declspec(dllimport) #endif #else #define PUBLIC_API #endif // Version numbering (v1.6.3) #define CLBLAST_VERSION_MAJOR 1 #define CLBLAST_VERSION_MINOR 6 #define CLBLAST_VERSION_PATCH 3 namespace clblast { // ================================================================================================= // Status codes. These codes can be returned by functions declared in this header file. The error // codes match either the standard OpenCL error codes or the clBLAS error codes. enum class StatusCode { // Status codes in common with the OpenCL standard kSuccess = 0, // CL_SUCCESS kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error kInvalidValue = -30, // CL_INVALID_VALUE kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT kInvalidBinary = -42, // CL_INVALID_BINARY kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS kInvalidProgram = -44, // CL_INVALID_PROGRAM kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION kInvalidKernel = -48, // CL_INVALID_KERNEL kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ...
or for a specific dimension kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST kInvalidEvent = -58, // CL_INVALID_EVENT kInvalidOperation = -59, // CL_INVALID_OPERATION kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE // Status codes in common with the clBLAS library kNotImplemented = -1024, // Routine or functionality not implemented yet kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension kInvalidIncrementX = -1013, // Increment of vector X cannot be zero kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast kInsufficientMemoryTemp = -2050, // Temporary buffer provided to GEMM routine is too small kInvalidBatchCount = -2049, // The batch count needs to be positive kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small kDatabaseError = -2041, // Entry for the device was not found in the database kUnknownError = -2040, // A catch-all error code representing an unspecified error kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception }; // Matrix layout and transpose types enum class Layout { kRowMajor = 101, kColMajor = 102 }; enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 }; enum class Triangle { kUpper = 121, kLower = 122 }; enum class Diagonal { kNonUnit = 131, kUnit = 132 }; enum class Side { kLeft = 141, kRight = 142 }; enum class KernelMode { kCrossCorrelation = 151, kConvolution = 152 }; // Precision scoped enum (values in bits) enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 }; // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= // Generate givens plane rotation: SROTG/DROTG template StatusCode Rotg(cl_mem sa_buffer, 
const size_t sa_offset, cl_mem sb_buffer, const size_t sb_offset, cl_mem sc_buffer, const size_t sc_offset, cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event = nullptr); // Generate modified givens plane rotation: SROTMG/DROTMG template StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event = nullptr); // Apply givens plane rotation: SROT/DROT template StatusCode Rot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const T cos, const T sin, cl_command_queue* queue, cl_event* event = nullptr); // Apply modified givens plane rotation: SROTM/DROTM template StatusCode Rotm(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event = nullptr); // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template StatusCode Scal(const size_t n, const T alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template StatusCode Axpy(const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Dot product of two vectors: SDOT/DDOT/HDOT template StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Dot product of two complex vectors: CDOTU/ZDOTU template StatusCode Dotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC template StatusCode Dotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template StatusCode 
Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN template StatusCode Amin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV template StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template StatusCode Hemv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template StatusCode Hbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const 
size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template StatusCode Hpmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template StatusCode Sbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template StatusCode Spmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template StatusCode Tbsv(const Layout layout, const Triangle triangle, const 
Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); // General rank-1 matrix update: SGER/DGER/HGER template StatusCode Ger(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); // General rank-1 complex matrix update: CGERU/ZGERU template StatusCode Geru(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template StatusCode Gerc(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); // Hermitian rank-1 matrix update: CHER/ZHER template StatusCode Her(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template StatusCode Hpr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); // Hermitian rank-2 matrix update: CHER2/ZHER2 template StatusCode Her2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template StatusCode Hpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); 
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr, cl_mem temp_buffer = nullptr); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); // Rank-K update of a hermitian matrix: CHERK/ZHERK template StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, 
cl_event* event = nullptr); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= // Extra non-BLAS routines (level-X) // ================================================================================================= // Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD template StatusCode Had(const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const T beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event = nullptr); // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY template StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL template StatusCode Im2col(const KernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event = nullptr); // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM template StatusCode Col2im(const KernelMode kernel_mode, const 
size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event = nullptr); // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM template StatusCode Convgemm(const KernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event = nullptr); // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED template StatusCode AxpyBatched(const size_t n, const T *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event = nullptr); // Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED template StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const T *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event = nullptr); // StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED template StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= // Retrieves the required size of the temporary buffer for the GEMM kernel (optional) template StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t& temp_buffer_size); // ================================================================================================= // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. 
// =================================================================================================

// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
StatusCode PUBLIC_API ClearCache();

// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
// Further CLBlast routine calls will then run at maximum speed.
StatusCode PUBLIC_API FillCache(const cl_device_id device);

// =================================================================================================

// Retrieves current tuning parameters for a specific device-precision-kernel combination
StatusCode PUBLIC_API RetrieveParameters(const cl_device_id device, const std::string &kernel_name, const Precision precision, std::unordered_map<std::string,size_t> &parameters);

// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name, const Precision precision, const std::unordered_map<std::string,size_t> &parameters);
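// Illustrative sketch (not part of this header; the parameter names and values are examples only,
// e.g. taken from an offline tuning run): overriding the single-precision GEMM parameters for a
// device. The next GEMM call then re-compiles the kernel with these values; an incomplete set
// makes the call fail with the missing-override-parameter status code.
//
//   const std::unordered_map<std::string,size_t> my_parameters = {
//       {"MWG", 64}, {"NWG", 64}, {"KWG", 32}  // subset shown for brevity
//   };
//   const auto status = clblast::OverrideParameters(device, "Xgemm",
//                                                   clblast::Precision::kSingle, my_parameters);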
// =================================================================================================

// Tunes the "Xaxpy" kernel, used for many level-1 routines such as XAXPY, XCOPY, and XSWAP
template <typename T>
StatusCode TuneXaxpy(cl_command_queue* queue, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Xdot" kernel, used for level-1 reduction routines such as XDOT, XMAX, and XSUM
template <typename T>
StatusCode TuneXdot(cl_command_queue* queue, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Xgemv" kernel, used for matrix-vector level-2 routines such as XGEMV, XGBMV, and XHEMV
template <typename T>
StatusCode TuneXgemv(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Xger" kernel, used for matrix update level-2 routines such as XGER, XHER, and XSYR2
template <typename T>
StatusCode TuneXger(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Xgemm" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode TuneXgemm(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "XgemmDirect" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode TuneXgemmDirect(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Copy" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode TuneCopy(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Pad" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode TunePad(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Transpose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode TuneTranspose(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Padtranspose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
template <typename T>
StatusCode TunePadtranspose(cl_command_queue* queue, const size_t m, const size_t n, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// Tunes the "Invert" kernel, used for the level-3 routine XTRSM
template <typename T>
StatusCode TuneInvert(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map<std::string,size_t> &parameters);

// =================================================================================================

} // namespace clblast

// CLBLAST_CLBLAST_H_
#endif
CLBlast-1.6.3/include/clblast_c.h000066400000000000000000004361351463263031500165450ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file contains the plain C interface to the CLBlast BLAS routines, the counterpart of the
// normal 'clblast.h' C++ header.
//
// =================================================================================================

#ifndef CLBLAST_CLBLAST_C_H_
#define CLBLAST_CLBLAST_C_H_

// Includes the normal OpenCL C header
#if defined(__APPLE__) || defined(__MACOSX)
  #include <OpenCL/opencl.h>
#else
  #include <CL/opencl.h>
#endif

// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#if defined(_WIN32) && defined(CLBLAST_DLL)
  #if defined(COMPILING_DLL)
    #define PUBLIC_API __declspec(dllexport)
  #else
    #define PUBLIC_API __declspec(dllimport)
  #endif
#else
  #define PUBLIC_API
#endif

// Version numbering (v1.6.3)
#define CLBLAST_VERSION_MAJOR 1
#define CLBLAST_VERSION_MINOR 6
#define CLBLAST_VERSION_PATCH 3

// The C interface
#ifdef __cplusplus
extern "C" {
#endif

// =================================================================================================

// Status codes. These codes can be returned by functions declared in this header file. The error
// codes match either the standard OpenCL error codes or the clBLAS error codes.
typedef enum CLBlastStatusCode_ {

  // Status codes in common with the OpenCL standard
  CLBlastSuccess                    =   0, // CL_SUCCESS
  CLBlastOpenCLCompilerNotAvailable =  -3, // CL_COMPILER_NOT_AVAILABLE
  CLBlastTempBufferAllocFailure     =  -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
  CLBlastOpenCLOutOfResources       =  -5, // CL_OUT_OF_RESOURCES
  CLBlastOpenCLOutOfHostMemory      =  -6, // CL_OUT_OF_HOST_MEMORY
  CLBlastOpenCLBuildProgramFailure  = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
  CLBlastInvalidValue               = -30, // CL_INVALID_VALUE
  CLBlastInvalidCommandQueue        = -36, // CL_INVALID_COMMAND_QUEUE
  CLBlastInvalidMemObject           = -38, // CL_INVALID_MEM_OBJECT
  CLBlastInvalidBinary              = -42, // CL_INVALID_BINARY
  CLBlastInvalidBuildOptions        = -43, // CL_INVALID_BUILD_OPTIONS
  CLBlastInvalidProgram             = -44, // CL_INVALID_PROGRAM
  CLBlastInvalidProgramExecutable   = -45, // CL_INVALID_PROGRAM_EXECUTABLE
  CLBlastInvalidKernelName          = -46, // CL_INVALID_KERNEL_NAME
  CLBlastInvalidKernelDefinition    = -47, // CL_INVALID_KERNEL_DEFINITION
  CLBlastInvalidKernel              = -48, // CL_INVALID_KERNEL
  CLBlastInvalidArgIndex            = -49, // CL_INVALID_ARG_INDEX
  CLBlastInvalidArgValue            = -50, // CL_INVALID_ARG_VALUE
  CLBlastInvalidArgSize             = -51, // CL_INVALID_ARG_SIZE
  CLBlastInvalidKernelArgs          = -52, // CL_INVALID_KERNEL_ARGS
  CLBlastInvalidLocalNumDimensions  = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
  CLBlastInvalidLocalThreadsTotal   = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
  CLBlastInvalidLocalThreadsDim     = -55, // CL_INVALID_WORK_ITEM_SIZE: ...
or for a specific dimension CLBlastInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET CLBlastInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST CLBlastInvalidEvent = -58, // CL_INVALID_EVENT CLBlastInvalidOperation = -59, // CL_INVALID_OPERATION CLBlastInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE CLBlastInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE // Status codes in common with the clBLAS library CLBlastNotImplemented = -1024, // Routine or functionality not implemented yet CLBlastInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer CLBlastInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer CLBlastInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer CLBlastInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer CLBlastInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer CLBlastInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero CLBlastInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension CLBlastInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension CLBlastInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension CLBlastInvalidIncrementX = -1013, // Increment of vector X cannot be zero CLBlastInvalidIncrementY = -1012, // Increment of vector Y cannot be zero CLBlastInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small CLBlastInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small CLBlastInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small CLBlastInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast CLBlastInsufficientMemoryTemp = -2050, // Temporary buffer provided to GEMM routine is too small CLBlastInvalidBatchCount = -2049, // The batch count needs to be positive CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device CLBlastNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device CLBlastNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device CLBlastInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer CLBlastInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small CLBlastDatabaseError = -2041, // Entry for the device was not found in the database CLBlastUnknownError = -2040, // A catch-all error code representing an unspecified error CLBlastUnexpectedError = -2039, // A catch-all error code representing an unexpected exception } CLBlastStatusCode; // Matrix layout and transpose types typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, CLBlastLayoutColMajor = 102 } CLBlastLayout; typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, CLBlastTransposeConjugate = 113 } CLBlastTranspose; typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, CLBlastTriangleLower = 122 } CLBlastTriangle; typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; typedef enum CLBlastKernelMode_ { CLBlastKernelModeCrossCorrelation = 
151, CLBlastKernelModeConvolution = 152 } CLBlastKernelMode; // Precision enum (values in bits) typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32, CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232, CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision; // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= // Generate givens plane rotation: SROTG/DROTG CLBlastStatusCode PUBLIC_API CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset, cl_mem sb_buffer, const size_t sb_offset, cl_mem sc_buffer, const size_t sc_offset, cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, cl_mem sb_buffer, const size_t sb_offset, cl_mem sc_buffer, const size_t sc_offset, cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event); // Generate modified givens plane rotation: SROTMG/DROTMG CLBlastStatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); // Apply givens plane rotation: SROT/DROT CLBlastStatusCode PUBLIC_API CLBlastSrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const float cos, const float sin, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const double cos, const double sin, cl_command_queue* queue, cl_event* event); // Apply modified givens plane rotation: SROTM/DROTM CLBlastStatusCode PUBLIC_API CLBlastSrotm(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDrotm(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP CLBlastStatusCode PUBLIC_API CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t 
x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL CLBlastStatusCode PUBLIC_API CLBlastSscal(const size_t n, const float alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDscal(const size_t n, const double alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCscal(const size_t n, const cl_float2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZscal(const size_t n, const cl_double2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHscal(const size_t n, const cl_half alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY CLBlastStatusCode PUBLIC_API CLBlastScopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY CLBlastStatusCode PUBLIC_API CLBlastSaxpy(const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDaxpy(const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCaxpy(const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZaxpy(const 
size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHaxpy(const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);

// Dot product of two vectors: SDOT/DDOT/HDOT
CLBlastStatusCode PUBLIC_API CLBlastSdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);
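// Illustrative sketch (not part of this header): a typical CLBlastSdot call. It assumes an
// existing context, queue, and two initialized single-precision vectors of length 'n'; the
// result goes to a unit-sized buffer that is read back afterwards. Error handling is omitted.
//
//   cl_event event = NULL;
//   cl_mem dot_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, NULL);
//   CLBlastStatusCode status = CLBlastSdot(n, dot_buffer, 0,
//                                          x_buffer, 0, 1,
//                                          y_buffer, 0, 1,
//                                          &queue, &event);
//   float result = 0.0f;
//   if (status == CLBlastSuccess) {
//     clWaitForEvents(1, &event);
//     clEnqueueReadBuffer(queue, dot_buffer, CL_TRUE, 0, sizeof(float), &result, 0, NULL, NULL);
//   }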
// Dot product of two complex vectors: CDOTU/ZDOTU
CLBlastStatusCode PUBLIC_API CLBlastCdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);

// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
CLBlastStatusCode PUBLIC_API CLBlastCdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event);

// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
CLBlastStatusCode PUBLIC_API CLBlastSnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastScnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);

// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
CLBlastStatusCode PUBLIC_API CLBlastSasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastScasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);

// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
CLBlastStatusCode PUBLIC_API CLBlastSsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastScsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);

// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
CLBlastStatusCode PUBLIC_API CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiDamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiCamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastiHamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); //
Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN CLBlastStatusCode PUBLIC_API CLBlastiSamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiDamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiCamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiZamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiHamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX CLBlastStatusCode PUBLIC_API CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiDmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiCmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiHmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN CLBlastStatusCode PUBLIC_API CLBlastiSmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiDmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiCmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastiHmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // 
================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV CLBlastStatusCode PUBLIC_API CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV CLBlastStatusCode PUBLIC_API CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_float2 alpha, const cl_mem 
a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV CLBlastStatusCode PUBLIC_API CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV CLBlastStatusCode PUBLIC_API CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV CLBlastStatusCode PUBLIC_API CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV CLBlastStatusCode PUBLIC_API CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV CLBlastStatusCode PUBLIC_API CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV CLBlastStatusCode PUBLIC_API CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHspmv(const CLBlastLayout 
layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV CLBlastStatusCode PUBLIC_API CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV CLBlastStatusCode PUBLIC_API CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, 
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV CLBlastStatusCode PUBLIC_API CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV CLBlastStatusCode PUBLIC_API CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const 
CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV CLBlastStatusCode PUBLIC_API CLBlastStbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV CLBlastStatusCode PUBLIC_API CLBlastStpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); // General rank-1 matrix update: SGER/DGER/HGER CLBlastStatusCode PUBLIC_API CLBlastSger(const CLBlastLayout layout, const size_t m, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDger(const CLBlastLayout layout, const size_t m, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHger(const CLBlastLayout layout, const size_t m, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); // General rank-1 complex matrix update: CGERU/ZGERU CLBlastStatusCode PUBLIC_API CLBlastCgeru(const CLBlastLayout layout, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZgeru(const CLBlastLayout layout, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); // General rank-1 complex conjugated matrix update: CGERC/ZGERC CLBlastStatusCode PUBLIC_API CLBlastCgerc(const CLBlastLayout layout, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZgerc(const CLBlastLayout layout, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); // Hermitian rank-1 matrix update: CHER/ZHER CLBlastStatusCode PUBLIC_API CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); // Hermitian packed rank-1 matrix update: CHPR/ZHPR CLBlastStatusCode PUBLIC_API CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, 
const size_t ap_offset, cl_command_queue* queue, cl_event* event); // Hermitian rank-2 matrix update: CHER2/ZHER2 CLBlastStatusCode PUBLIC_API CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 CLBlastStatusCode PUBLIC_API CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR CLBlastStatusCode PUBLIC_API CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR CLBlastStatusCode PUBLIC_API CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); // Symmetric rank-2 matrix 
update: SSYR2/DSYR2/HSYR2
CLBlastStatusCode PUBLIC_API CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event);

// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
CLBlastStatusCode PUBLIC_API CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event);

// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================

// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
CLBlastStatusCode PUBLIC_API CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event);
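// Illustrative sketch (not part of this header): single-precision GEMM through the C API,
// computing C = alpha * A * B + beta * C on row-major matrices. The queue and buffers are
// placeholders assumed to be set up elsewhere; error handling is omitted.
//
//   cl_event event = NULL;
//   CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutRowMajor,
//                                           CLBlastTransposeNo, CLBlastTransposeNo,
//                                           m, n, k,
//                                           1.0f, a_buffer, 0, k,
//                                                 b_buffer, 0, n,
//                                           0.0f, c_buffer, 0, n,
//                                           &queue, &event);
//   if (status == CLBlastSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }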
CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM CLBlastStatusCode PUBLIC_API CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 
cl_command_queue* queue, cl_event* event); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM CLBlastStatusCode PUBLIC_API CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK CLBlastStatusCode PUBLIC_API CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); // Rank-K update of a hermitian matrix: CHERK/ZHERK CLBlastStatusCode PUBLIC_API CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const double alpha, 
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K CLBlastStatusCode PUBLIC_API CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K CLBlastStatusCode PUBLIC_API CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM CLBlastStatusCode PUBLIC_API 
CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const 
size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); // ================================================================================================= // Extra non-BLAS routines (level-X) // ================================================================================================= // Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD CLBlastStatusCode PUBLIC_API CLBlastShad(const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const float beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDhad(const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const double beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastChad(const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const cl_float2 beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZhad(const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const cl_double2 beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHhad(const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const cl_half beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event); // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY CLBlastStatusCode PUBLIC_API CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t 
b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL CLBlastStatusCode PUBLIC_API CLBlastSim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event); // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM CLBlastStatusCode PUBLIC_API CLBlastScol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const 
size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event); // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event); // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED 
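// Each batched call below takes per-batch scalars and offsets, so that a single library call
// processes 'batch_count' independent problems at once. A minimal usage sketch (illustrative
// only, hence guarded by '#if 0'): it assumes 'x_buffer' and 'y_buffer' are existing cl_mem
// buffers holding at least 2*n floats each, and that 'queue' and 'event' are a valid OpenCL
// command queue and event; error handling is omitted.
#if 0
  const size_t n = 1024;
  const float alphas[] = {1.0f, 2.0f};    // one alpha per batch
  const size_t x_offsets[] = {0, n};      // both batches packed into a single buffer
  const size_t y_offsets[] = {0, n};
  CLBlastStatusCode status = CLBlastSaxpyBatched(n, alphas,
                                                 x_buffer, x_offsets, 1,
                                                 y_buffer, y_offsets, 1,
                                                 2, &queue, &event);  // batch_count == 2
#endif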
CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n, const float *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n, const double *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); // Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED CLBlastStatusCode PUBLIC_API CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const float *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const double *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_float2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_double2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* 
event); CLBlastStatusCode PUBLIC_API CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_half *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event); // StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED CLBlastStatusCode PUBLIC_API CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, cl_command_queue* queue, cl_event* event); // 
================================================================================================= // General matrix-matrix multiplication with temporary buffer from user (optional, for advanced users): SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM CLBlastStatusCode PUBLIC_API CLBlastSgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); CLBlastStatusCode PUBLIC_API CLBlastDgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); CLBlastStatusCode PUBLIC_API CLBlastCgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); CLBlastStatusCode PUBLIC_API CLBlastZgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); CLBlastStatusCode PUBLIC_API CLBlastHgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); // ================================================================================================= // Retrieves the required size of the temporary buffer for the GEMM kernel: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM (optional) CLBlastStatusCode PUBLIC_API CLBlastSGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size); CLBlastStatusCode PUBLIC_API CLBlastDGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const 
size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size); CLBlastStatusCode PUBLIC_API CLBlastCGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size); CLBlastStatusCode PUBLIC_API CLBlastZGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size); CLBlastStatusCode PUBLIC_API CLBlastHGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, size_t* temp_buffer_size); // ================================================================================================= // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. CLBlastStatusCode PUBLIC_API CLBlastClearCache(); // The cache can also be pre-initialized for a specific device with all possible CLBlast kernels. // Further CLBlast routine calls will then run at maximum speed. CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device); // ================================================================================================= // Overrides tuning parameters for a specific device-precision-kernel combination. The next time // the target routine is called it will re-compile and use the new parameters from then on. CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, const CLBlastPrecision precision, const size_t num_parameters, const char** parameters_names, const size_t* parameters_values); // ================================================================================================= #ifdef __cplusplus } // extern "C" #endif // CLBLAST_CLBLAST_C_H_ #endif CLBlast-1.6.3/include/clblast_cuda.h000066400000000000000000001131341463263031500172270ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the special CUDA interface to the CLBlast BLAS routines. It also contains the // definitions of the returned status codes and the layout and transpose types. This is the header // users of the CUDA API of CLBlast should include and use. 
//
// =================================================================================================

#ifndef CLBLAST_CLBLAST_CUDA_H_
#define CLBLAST_CLBLAST_CUDA_H_

#include <cstdlib>       // For size_t
#include <string>        // For OverrideParameters function
#include <unordered_map> // For OverrideParameters function

// CUDA
#include <cuda.h>  // CUDA driver API
#include <nvrtc.h> // NVIDIA runtime compilation API

// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#if defined(_WIN32) && defined(CLBLAST_DLL)
  #if defined(COMPILING_DLL)
    #define PUBLIC_API __declspec(dllexport)
  #else
    #define PUBLIC_API __declspec(dllimport)
  #endif
#else
  #define PUBLIC_API
#endif

namespace clblast {
// =================================================================================================

// Status codes. These codes can be returned by functions declared in this header file. The error
// codes match either the standard CUDA driver API error codes or the regular CLBlast error codes.
enum class StatusCode {

  // Status codes in common with the OpenCL standard
  kSuccess                   =   0, // CUDA_SUCCESS
  kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
  kInvalidLocalThreadsTotal  = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
  kInvalidLocalThreadsDim    = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension

  // Status codes in common with the clBLAS library
  kNotImplemented            = -1024, // Routine or functionality not implemented yet
  kInvalidMatrixA            = -1022, // Matrix A is not a valid OpenCL buffer
  kInvalidMatrixB            = -1021, // Matrix B is not a valid OpenCL buffer
  kInvalidMatrixC            = -1020, // Matrix C is not a valid OpenCL buffer
  kInvalidVectorX            = -1019, // Vector X is not a valid OpenCL buffer
  kInvalidVectorY            = -1018, // Vector Y is not a valid OpenCL buffer
  kInvalidDimension          = -1017, // Dimensions M, N, and K have to be larger than zero
  kInvalidLeadDimA           = -1016, // LD of A is smaller than the matrix's first dimension
  kInvalidLeadDimB           = -1015, // LD of B is smaller than the matrix's first dimension
  kInvalidLeadDimC           = -1014, // LD of C is smaller than the matrix's first dimension
  kInvalidIncrementX         = -1013, // Increment of vector X cannot be zero
  kInvalidIncrementY         = -1012, // Increment of vector Y cannot be zero
  kInsufficientMemoryA       = -1011, // Matrix A's OpenCL buffer is too small
  kInsufficientMemoryB       = -1010, // Matrix B's OpenCL buffer is too small
  kInsufficientMemoryC       = -1009, // Matrix C's OpenCL buffer is too small
  kInsufficientMemoryX       = -1008, // Vector X's OpenCL buffer is too small
  kInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small

  // Custom additional status codes for CLBlast
  kInsufficientMemoryTemp    = -2050, // Temporary buffer provided to GEMM routine is too small
  kInvalidBatchCount         = -2049, // The batch count needs to be positive
  kInvalidOverrideKernel     = -2048, // Trying to override parameters for an invalid kernel
  kMissingOverrideParameter  = -2047, // Missing override parameter(s) for the target kernel
  kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
  kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
  kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
  kInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
  kInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
  kDatabaseError             = -2041, // Entry for the device was not found in the database
  kUnknownError              = -2040, // A catch-all error code representing an unspecified error
  kUnexpectedError           = -2039, // A catch-all error code representing an unexpected exception
};

// Matrix layout and transpose types
enum class Layout { kRowMajor = 101, kColMajor = 102 };
enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 };
enum class Triangle { kUpper = 121, kLower = 122 };
enum class Diagonal { kNonUnit = 131, kUnit = 132 };
enum class Side { kLeft = 141, kRight = 142 };
enum class KernelMode { kCrossCorrelation = 151, kConvolution = 152 };

// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
                       kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 };

// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// Generate Givens plane rotation: SROTG/DROTG
template <typename T>
StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset,
                CUdeviceptr sb_buffer, const size_t sb_offset,
                CUdeviceptr sc_buffer, const size_t sc_offset,
                CUdeviceptr ss_buffer, const size_t ss_offset,
                const CUcontext context, const CUdevice device);

// Generate modified Givens plane rotation: SROTMG/DROTMG
template <typename T>
StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset,
                 CUdeviceptr sd2_buffer, const size_t sd2_offset,
                 CUdeviceptr sx1_buffer, const size_t sx1_offset,
                 const CUdeviceptr sy1_buffer, const size_t sy1_offset,
                 CUdeviceptr sparam_buffer, const size_t sparam_offset,
                 const CUcontext context, const CUdevice device);

// Apply Givens plane rotation: SROT/DROT
template <typename T>
StatusCode Rot(const size_t n,
               CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
               const T cos, const T sin,
               const CUcontext context, const CUdevice device);

// Apply modified Givens plane rotation: SROTM/DROTM
template <typename T>
StatusCode Rotm(const size_t n,
                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUdeviceptr sparam_buffer, const size_t sparam_offset,
                const CUcontext context, const CUdevice device);

// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
template <typename T>
StatusCode Swap(const size_t n,
                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device);

// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
template <typename T>
StatusCode Scal(const size_t n, const T alpha,
                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device);

// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
template <typename T>
StatusCode Copy(const size_t n,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device);

// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
template <typename T>
StatusCode Axpy(const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device);

// Dot product of two vectors: SDOT/DDOT/HDOT
template <typename T>
StatusCode Dot(const size_t n,
               CUdeviceptr dot_buffer, const size_t dot_offset,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
               const CUcontext context, const CUdevice device);

// Dot product of two complex vectors: CDOTU/ZDOTU
template <typename T>
StatusCode Dotu(const size_t n,
                CUdeviceptr dot_buffer, const size_t dot_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device);

// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
template <typename T>
StatusCode Dotc(const size_t n,
                CUdeviceptr dot_buffer, const size_t dot_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device);

// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
template <typename T>
StatusCode Nrm2(const size_t n,
                CUdeviceptr nrm2_buffer, const size_t nrm2_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device);

// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
template <typename T>
StatusCode Asum(const size_t n,
                CUdeviceptr asum_buffer, const size_t asum_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device);

// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
template <typename T>
StatusCode Sum(const size_t n,
               CUdeviceptr sum_buffer, const size_t sum_offset,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUcontext context, const CUdevice device);

// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
StatusCode Amax(const size_t n,
                CUdeviceptr imax_buffer, const size_t imax_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device);

// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
template <typename T>
StatusCode Amin(const size_t n,
                CUdeviceptr imin_buffer, const size_t imin_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device);

// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
template <typename T>
StatusCode Max(const size_t n,
               CUdeviceptr imax_buffer, const size_t imax_offset,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUcontext context, const CUdevice device);

// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
template <typename T>
StatusCode Min(const size_t n,
               CUdeviceptr imin_buffer, const size_t imin_offset,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUcontext context, const CUdevice device);

// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================

// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n,
                const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device);
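// Minimal usage sketch for the CUDA API above (illustrative only, hence guarded by '#if 0'): it
// assumes a single CUDA-capable device, placeholder sizes and host data, and omits error checking
// of the driver-API calls. It computes y = alpha * A * x + beta * y through Gemv.
#if 0
#include <vector>
void gemv_example() {  // hypothetical helper, not part of the library
  cuInit(0);
  CUdevice device; cuDeviceGet(&device, 0);
  CUcontext context; cuCtxCreate(&context, 0, device);
  const size_t m = 128, n = 64;
  std::vector<float> host_a(m * n, 1.0f), host_x(n, 2.0f), host_y(m, 0.0f);
  CUdeviceptr a, x, y;
  cuMemAlloc(&a, m * n * sizeof(float));
  cuMemAlloc(&x, n * sizeof(float));
  cuMemAlloc(&y, m * sizeof(float));
  cuMemcpyHtoD(a, host_a.data(), m * n * sizeof(float));
  cuMemcpyHtoD(x, host_x.data(), n * sizeof(float));
  cuMemcpyHtoD(y, host_y.data(), m * sizeof(float));
  // Row-major m-by-n matrix with leading dimension n: y = 1.0 * A * x + 0.0 * y
  const auto status = clblast::Gemv(clblast::Layout::kRowMajor, clblast::Transpose::kNo, m, n,
                                    1.0f, a, 0, n, x, 0, 1, 0.0f, y, 0, 1,
                                    context, device);
  if (status == clblast::StatusCode::kSuccess) {
    cuMemcpyDtoH(host_y.data(), y, m * sizeof(float));  // fetch the result back to the host
  }
  cuMemFree(a); cuMemFree(x); cuMemFree(y);
  cuCtxDestroy(context);
}
#endif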
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const CUcontext context, const CUdevice device); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template StatusCode Hemv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const CUcontext context, const CUdevice device); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template StatusCode Hbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const CUcontext context, const CUdevice device); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template StatusCode Hpmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr ap_buffer, const size_t ap_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const CUcontext context, const CUdevice device); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const CUcontext context, const CUdevice device); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template StatusCode Sbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const CUcontext context, const CUdevice device); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template StatusCode Spmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr ap_buffer, const size_t ap_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const CUcontext context, const CUdevice device); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUcontext context, const CUdevice device); // Triangular banded matrix-vector multiplication: 
STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUcontext context, const CUdevice device); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUcontext context, const CUdevice device); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUcontext context, const CUdevice device); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUcontext context, const CUdevice device); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUcontext context, const CUdevice device); // General rank-1 matrix update: SGER/DGER/HGER template StatusCode Ger(const Layout layout, const size_t m, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUcontext context, const CUdevice device); // General rank-1 complex matrix update: CGERU/ZGERU template StatusCode Geru(const Layout layout, const size_t m, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUcontext context, const CUdevice device); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template StatusCode Gerc(const Layout layout, const size_t m, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUcontext context, const CUdevice device); // Hermitian rank-1 matrix update: CHER/ZHER template StatusCode Her(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUcontext context, const CUdevice device); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template 
StatusCode Hpr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, const CUcontext context, const CUdevice device); // Hermitian rank-2 matrix update: CHER2/ZHER2 template StatusCode Her2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUcontext context, const CUdevice device); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template StatusCode Hpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, const CUcontext context, const CUdevice device); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUcontext context, const CUdevice device); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, const CUcontext context, const CUdevice device); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUcontext context, const CUdevice device); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, const CUcontext context, const CUdevice device); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, const CUcontext context, const CUdevice device, CUdeviceptr temp_buffer = 0); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const CUdeviceptr a_buffer, const size_t 
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n,
                const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device);

// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n,
                const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device);

// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k,
                const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device);

// Rank-K update of a Hermitian matrix: CHERK/ZHERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k,
                const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device);

// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k,
                 const T alpha,
                 const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                 const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                 const T beta,
                 CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                 const CUcontext context, const CUdevice device);

// Rank-2K update of a Hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k,
                 const T alpha,
                 const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                 const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                 const U beta,
                 CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                 const CUcontext context, const CUdevice device);

// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n,
                const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                const CUcontext context, const CUdevice device);

// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n,
                const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                const CUcontext context, const CUdevice device);
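//
// Example (illustrative only, not part of the original header): solving the triangular system
// A * X = alpha * B in single precision via Trsm, declared above; B is overwritten with the
// solution X. It assumes a valid CUcontext/CUdevice pair and hypothetical device buffers
// a_dev (m x m, lower-triangular) and b_dev (m x n), both row-major:
//
//   const size_t m = 64, n = 32;
//   const auto status = clblast::Trsm(clblast::Layout::kRowMajor, clblast::Side::kLeft,
//                                     clblast::Triangle::kLower, clblast::Transpose::kNo,
//                                     clblast::Diagonal::kNonUnit, m, n, 1.0f,
//                                     a_dev, 0, m, b_dev, 0, n,
//                                     context, device);
//   if (status != clblast::StatusCode::kSuccess) { /* inspect the status code */ }
//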
// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================

// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
template <typename T>
StatusCode Had(const size_t n,
               const T alpha,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
               const T beta,
               CUdeviceptr z_buffer, const size_t z_offset, const size_t z_inc,
               const CUcontext context, const CUdevice device);

// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
                    const size_t m, const size_t n,
                    const T alpha,
                    const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                    CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                    const CUcontext context, const CUdevice device);

// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
template <typename T>
StatusCode Im2col(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w,
                  const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w,
                  const size_t dilation_h, const size_t dilation_w,
                  const CUdeviceptr im_buffer, const size_t im_offset,
                  CUdeviceptr col_buffer, const size_t col_offset,
                  const CUcontext context, const CUdevice device);

// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
template <typename T>
StatusCode Col2im(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w,
                  const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w,
                  const size_t dilation_h, const size_t dilation_w,
                  const CUdeviceptr col_buffer, const size_t col_offset,
                  CUdeviceptr im_buffer, const size_t im_offset,
                  const CUcontext context, const CUdevice device);

// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
template <typename T>
StatusCode Convgemm(const KernelMode kernel_mode,
                    const size_t channels, const size_t height, const size_t width,
                    const size_t kernel_h, const size_t kernel_w,
                    const size_t pad_h, const size_t pad_w,
                    const size_t stride_h, const size_t stride_w,
                    const size_t dilation_h, const size_t dilation_w,
                    const size_t num_kernels, const size_t batch_count,
                    const CUdeviceptr im_buffer, const size_t im_offset,
                    const CUdeviceptr kernel_buffer, const size_t kernel_offset,
                    CUdeviceptr result_buffer, const size_t result_offset,
                    const CUcontext context, const CUdevice device);

// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n,
                       const T *alphas,
                       const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc,
                       CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc,
                       const size_t batch_count,
                       const CUcontext context, const CUdevice device);

// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                       const size_t m, const size_t n, const size_t k,
                       const T *alphas,
                       const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld,
                       const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld,
                       const T *betas,
                       CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld,
                       const size_t batch_count,
                       const CUcontext context, const CUdevice device);
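//
// Example (illustrative only, not part of the original header): two single-precision GEMMs issued
// as one batch through GemmBatched, declared above, addressed via per-batch offsets into shared
// device buffers. The alphas, betas and offsets live in host arrays; the hypothetical buffers
// a_dev, b_dev and c_dev are assumed to hold both batches back-to-back (row-major, m x k, k x n
// and m x n per batch):
//
//   const size_t m = 64, n = 64, k = 64, batch_count = 2;
//   const float alphas[] = {1.0f, 1.0f};
//   const float betas[] = {0.0f, 0.0f};
//   const size_t a_offsets[] = {0, m * k}, b_offsets[] = {0, k * n}, c_offsets[] = {0, m * n};
//   const auto status = clblast::GemmBatched(clblast::Layout::kRowMajor,
//                                            clblast::Transpose::kNo, clblast::Transpose::kNo,
//                                            m, n, k, alphas,
//                                            a_dev, a_offsets, k, b_dev, b_offsets, n, betas,
//                                            c_dev, c_offsets, n, batch_count,
//                                            context, device);
//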
// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED
template <typename T>
StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                              const size_t m, const size_t n, const size_t k,
                              const T alpha,
                              const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                              const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
                              const T beta,
                              CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                              const size_t batch_count,
                              const CUcontext context, const CUdevice device);

// =================================================================================================

// Retrieves the required size of the temporary buffer for the GEMM kernel (optional)
template <typename T>
StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                              const size_t m, const size_t n, const size_t k,
                              const size_t a_offset, const size_t a_ld,
                              const size_t b_offset, const size_t b_ld,
                              const size_t c_offset, const size_t c_ld,
                              const CUdevice device, size_t& temp_buffer_size);

// =================================================================================================

// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
StatusCode PUBLIC_API ClearCache();

// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
// Further CLBlast routine calls will then run at maximum speed.
StatusCode PUBLIC_API FillCache(const CUdevice device);

// =================================================================================================

// Retrieves current tuning parameters for a specific device-precision-kernel combination
StatusCode PUBLIC_API RetrieveParameters(const CUdevice device, const std::string &kernel_name,
                                         const Precision precision,
                                         std::unordered_map<std::string, size_t> &parameters);

// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
StatusCode PUBLIC_API OverrideParameters(const CUdevice device, const std::string &kernel_name,
                                         const Precision precision,
                                         const std::unordered_map<std::string, size_t> &parameters);

// =================================================================================================

} // namespace clblast

// CLBLAST_CLBLAST_CUDA_H_
#endif
CLBlast-1.6.3/include/clblast_half.h000066400000000000000000001045001463263031500172230ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides simple conversion operations between fp16 (half) and fp32 (float). These
// conversion functions are based on ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf and
// are also part of the C++ half-precision header (http://half.sourceforge.net/).
//
// This file is pure C99.
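//
// Example (illustrative only, not part of the original header): round-tripping a value through
// the two conversion functions defined below. FloatToHalf truncates (round toward zero), so
// values that are not exactly representable in fp16 lose their low-order mantissa bits:
//
//   const half h = FloatToHalf(3.14159f);  // 0x4248
//   const float f = HalfToFloat(h);        // 3.140625f, the nearest fp16 value not above pi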
// // ================================================================================================= #ifndef CLBLAST_HALF_H_ #define CLBLAST_HALF_H_ // ================================================================================================= // The host data-type for half-precision floating-point (16-bit) is based on the `cl_half` OpenCL // type, which is a typedef for unsigned short. typedef unsigned short half; // 32-bit union for conversions typedef union ConversionBits_ { unsigned int i32; float f32; } ConversionBits; // ================================================================================================= // Converts a IEEE-compliant single-precision value to half-precision floating-point. This function // applies simple truncation (round toward zero, but with overflows set to infinity) as rounding // mode. static half FloatToHalf(const float value) { static const unsigned short base_table[512] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 }; static const unsigned char shift_table[512] = { 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 13 }; ConversionBits bits; bits.f32 = value; const unsigned short halfbits = base_table[bits.i32 >> 23] + (unsigned short)((bits.i32 & 0x7FFFFF) >> shift_table[bits.i32 >> 23]); return halfbits; } // Converts a half-precision value to IEEE-compliant single-precision floating-point static float HalfToFloat(const half value) { static const unsigned int mantissa_table[2048] = { 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 
0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 
0x380BC000, 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 
0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 
0x380FE000, 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 
0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 
0x3859E000, 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 
0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; static const unsigned int exponent_table[64] = { 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; static const unsigned short offset_table[64] = { 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; ConversionBits bits; bits.i32 = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + exponent_table[value >> 10]; return bits.f32; } // ================================================================================================= // CLBLAST_HALF_H_ #endif CLBlast-1.6.3/include/clblast_netlib_c.h000066400000000000000000001574571463263031500201120ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer // copies automatically and running on the default OpenCL platform and device. For full control over // performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. // // ================================================================================================= #ifndef CLBLAST_CLBLAST_NETLIB_C_H_ #define CLBLAST_CLBLAST_NETLIB_C_H_ // Exports library functions under Windows when building a DLL. 
// See also: https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#if defined(_WIN32) && defined(CLBLAST_DLL)
  #if defined(COMPILING_DLL)
    #define PUBLIC_API __declspec(dllexport)
  #else
    #define PUBLIC_API __declspec(dllimport)
  #endif
#else
  #define PUBLIC_API
#endif

// The C interface
#ifdef __cplusplus
extern "C" {
#endif

// =================================================================================================

// Matrix layout and transpose types
typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, CLBlastLayoutColMajor = 102 } CLBlastLayout;
typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, CLBlastTransposeConjugate = 113 } CLBlastTranspose;
typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, CLBlastTriangleLower = 122 } CLBlastTriangle;
typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
typedef enum CLBlastKernelMode_ { CLBlastKernelModeCrossCorrelation = 151, CLBlastKernelModeConvolution = 152 } CLBlastKernelMode;

// For full compatibility with CBLAS
typedef CLBlastLayout CBLAS_ORDER;
typedef CLBlastTranspose CBLAS_TRANSPOSE;
typedef CLBlastTriangle CBLAS_UPLO;
typedef CLBlastDiagonal CBLAS_DIAG;
typedef CLBlastSide CBLAS_SIDE;
#define CblasRowMajor CLBlastLayoutRowMajor
#define CblasColMajor CLBlastLayoutColMajor
#define CblasNoTrans CLBlastTransposeNo
#define CblasTrans CLBlastTransposeYes
#define CblasConjTrans CLBlastTransposeConjugate
#define CblasUpper CLBlastTriangleUpper
#define CblasLower CLBlastTriangleLower
#define CblasNonUnit CLBlastDiagonalNonUnit
#define CblasUnit CLBlastDiagonalUnit
#define CblasLeft CLBlastSideLeft
#define CblasRight CLBlastSideRight

// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// Generate Givens plane rotation: SROTG/DROTG
void PUBLIC_API cblas_srotg(float* sa, float* sb, float* sc, float* ss);
void PUBLIC_API cblas_drotg(double* sa, double* sb, double* sc, double* ss);

// Generate modified Givens plane rotation: SROTMG/DROTMG
void PUBLIC_API cblas_srotmg(float* sd1, float* sd2, float* sx1, const float sy1, float* sparam);
void PUBLIC_API cblas_drotmg(double* sd1, double* sd2, double* sx1, const double sy1, double* sparam);

// Apply Givens plane rotation: SROT/DROT
void PUBLIC_API cblas_srot(const int n, float* x, const int x_inc, float* y, const int y_inc, const float cos, const float sin);
void PUBLIC_API cblas_drot(const int n, double* x, const int x_inc, double* y, const int y_inc, const double cos, const double sin);

// Apply modified Givens plane rotation: SROTM/DROTM
void PUBLIC_API cblas_srotm(const int n, float* x, const int x_inc, float* y, const int y_inc, float* sparam);
void PUBLIC_API cblas_drotm(const int n, double* x, const int x_inc, double* y, const int y_inc, double* sparam);

// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
void PUBLIC_API cblas_sswap(const int n, float* x, const int x_inc, float* y, const int y_inc);
void PUBLIC_API cblas_dswap(const int n, double* x, const int x_inc, double* y, const int y_inc);
void PUBLIC_API cblas_cswap(const int n, void* x, const int x_inc, void* y, const int y_inc);
void PUBLIC_API cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc);
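//
// Example (illustrative only, not part of the original header): this Netlib layer takes plain
// host pointers and performs all device buffer copies internally on the default OpenCL platform
// and device, so a call looks exactly like a regular CBLAS call. Using cblas_saxpy (declared
// below) to compute y = 2 * x + y:
//
//   float x[] = {1.0f, 2.0f, 3.0f};
//   float y[] = {4.0f, 5.0f, 6.0f};
//   cblas_saxpy(3, 2.0f, x, 1, y, 1);  // y is now {6.0f, 9.0f, 12.0f}
//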
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
void PUBLIC_API cblas_sscal(const int n, const float alpha, float* x, const int x_inc);
void PUBLIC_API cblas_dscal(const int n, const double alpha, double* x, const int x_inc);
void PUBLIC_API cblas_cscal(const int n, const void* alpha, void* x, const int x_inc);
void PUBLIC_API cblas_zscal(const int n, const void* alpha, void* x, const int x_inc);

// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
void PUBLIC_API cblas_scopy(const int n, const float* x, const int x_inc, float* y, const int y_inc);
void PUBLIC_API cblas_dcopy(const int n, const double* x, const int x_inc, double* y, const int y_inc);
void PUBLIC_API cblas_ccopy(const int n, const void* x, const int x_inc, void* y, const int y_inc);
void PUBLIC_API cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc);

// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
void PUBLIC_API cblas_saxpy(const int n, const float alpha, const float* x, const int x_inc, float* y, const int y_inc);
void PUBLIC_API cblas_daxpy(const int n, const double alpha, const double* x, const int x_inc, double* y, const int y_inc);
void PUBLIC_API cblas_caxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc);
void PUBLIC_API cblas_zaxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc);

// Dot product of two vectors: SDOT/DDOT/HDOT
float PUBLIC_API cblas_sdot(const int n, const float* x, const int x_inc, const float* y, const int y_inc);
double PUBLIC_API cblas_ddot(const int n, const double* x, const int x_inc, const double* y, const int y_inc);

// Dot product of two complex vectors: CDOTU/ZDOTU
void PUBLIC_API cblas_cdotu_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot);
void PUBLIC_API cblas_zdotu_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot);

// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
void PUBLIC_API cblas_cdotc_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot);
void PUBLIC_API cblas_zdotc_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot);

// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
float PUBLIC_API cblas_snrm2(const int n, const float* x, const int x_inc);
double PUBLIC_API cblas_dnrm2(const int n, const double* x, const int x_inc);
float PUBLIC_API cblas_scnrm2(const int n, const void* x, const int x_inc);
double PUBLIC_API cblas_dznrm2(const int n, const void* x, const int x_inc);

// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
float PUBLIC_API cblas_sasum(const int n, const float* x, const int x_inc);
double PUBLIC_API cblas_dasum(const int n, const double* x, const int x_inc);
float PUBLIC_API cblas_scasum(const int n, const void* x, const int x_inc);
double PUBLIC_API cblas_dzasum(const int n, const void* x, const int x_inc);

// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
float PUBLIC_API cblas_ssum(const int n, const float* x, const int x_inc);
double PUBLIC_API cblas_dsum(const int n, const double* x, const int x_inc);
float PUBLIC_API cblas_scsum(const int n, const void* x, const int x_inc);
double PUBLIC_API cblas_dzsum(const int n, const void* x, const int x_inc);

// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
int PUBLIC_API cblas_isamax(const int n, const float* x, const int x_inc);
int PUBLIC_API
cblas_idamax(const int n, const double* x, const int x_inc); int PUBLIC_API cblas_icamax(const int n, const void* x, const int x_inc); int PUBLIC_API cblas_izamax(const int n, const void* x, const int x_inc); // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN int PUBLIC_API cblas_isamin(const int n, const float* x, const int x_inc); int PUBLIC_API cblas_idamin(const int n, const double* x, const int x_inc); int PUBLIC_API cblas_icamin(const int n, const void* x, const int x_inc); int PUBLIC_API cblas_izamin(const int n, const void* x, const int x_inc); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX int PUBLIC_API cblas_ismax(const int n, const float* x, const int x_inc); int PUBLIC_API cblas_idmax(const int n, const double* x, const int x_inc); int PUBLIC_API cblas_icmax(const int n, const void* x, const int x_inc); int PUBLIC_API cblas_izmax(const int n, const void* x, const int x_inc); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN int PUBLIC_API cblas_ismin(const int n, const float* x, const int x_inc); int PUBLIC_API cblas_idmin(const int n, const double* x, const int x_inc); int PUBLIC_API cblas_icmin(const int n, const void* x, const int x_inc); int PUBLIC_API cblas_izmin(const int n, const void* x, const int x_inc); // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, const double* x, const int x_inc, const double beta, double* y, const int y_inc); void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const double alpha, const double* a, const int a_ld, const double* x, const int x_inc, const double beta, double* y, const int y_inc); void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const 
void* beta, void* y, const int y_inc); void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* a, const int a_ld, const double* x, const int x_inc, const double beta, double* y, const int y_inc); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* x, const int x_inc, const double beta, double* y, const int y_inc); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* ap, const float* x, const int x_inc, const float beta, float* y, const int y_inc); void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* ap, const double* x, const int x_inc, const double beta, double* y, const int y_inc); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV void PUBLIC_API 
cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, const int a_ld, float* x, const int x_inc); void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc); void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc); void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc); void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc); void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc); void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, const int a_ld, float* x, const int x_inc); void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc); void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal 
diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc); void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc); void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc); void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc); void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); // General rank-1 matrix update: SGER/DGER/HGER void PUBLIC_API cblas_sger(const CLBlastLayout layout, const int m, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld); void PUBLIC_API cblas_dger(const CLBlastLayout layout, const int m, const int n, const double alpha, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld); // General rank-1 complex matrix update: CGERU/ZGERU void PUBLIC_API cblas_cgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); void PUBLIC_API cblas_zgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); // General rank-1 complex conjugated matrix update: CGERC/ZGERC void PUBLIC_API cblas_cgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, const void* y, 
const int y_inc, void* a, const int a_ld); // Hermitian rank-1 matrix update: CHER/ZHER void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const void* x, const int x_inc, void* a, const int a_ld); void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const void* x, const int x_inc, void* a, const int a_ld); // Hermitian packed rank-1 matrix update: CHPR/ZHPR void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const void* x, const int x_inc, void* ap); void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const void* x, const int x_inc, void* ap); // Hermitian rank-2 matrix update: CHER2/ZHER2 void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* ap); void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* ap); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, float* a, const int a_ld); void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, double* a, const int a_ld); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, float* ap); void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, double* ap); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld); void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* ap); void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, const double* y, const int y_inc, double* ap); // 
================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, const float beta, float* c, const int c_ld); void PUBLIC_API 
cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, const double beta, double* c, const int c_ld); void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* beta, void* c, const int c_ld); void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* beta, void* c, const int c_ld); // Rank-K update of a hermitian matrix: CHERK/ZHERK void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const float alpha, const void* a, const int a_ld, const float beta, void* c, const int c_ld); void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const double alpha, const void* a, const int a_ld, const double beta, void* c, const int c_ld); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const float beta, void* c, const int c_ld); void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const double beta, void* c, const int c_ld); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const 
CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); // ================================================================================================= // Extra non-BLAS routines (level-X) // ================================================================================================= // Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD void PUBLIC_API cblas_shad(const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, const float beta, float* z, const int z_inc); void PUBLIC_API cblas_dhad(const int n, const double alpha, const double* x, const int x_inc, const double* y, const int y_inc, const double beta, double* z, const int z_inc); void PUBLIC_API cblas_chad(const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, const void* beta, void* z, const int z_inc); void PUBLIC_API cblas_zhad(const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, const void* beta, void* z, const int z_inc); // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const 
int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL void PUBLIC_API cblas_sim2col(const CLBlastKernelMode kernel_mode, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const float* im, float* col); void PUBLIC_API cblas_dim2col(const CLBlastKernelMode kernel_mode, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const double* im, double* col); void PUBLIC_API cblas_cim2col(const CLBlastKernelMode kernel_mode, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const void* im, void* col); void PUBLIC_API cblas_zim2col(const CLBlastKernelMode kernel_mode, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const void* im, void* col); // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM void PUBLIC_API cblas_scol2im(const CLBlastKernelMode kernel_mode, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const float* col, float* im); void PUBLIC_API cblas_dcol2im(const CLBlastKernelMode kernel_mode, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const double* col, double* im); void PUBLIC_API cblas_ccol2im(const CLBlastKernelMode kernel_mode, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const void* col, void* im); void PUBLIC_API cblas_zcol2im(const CLBlastKernelMode kernel_mode, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const void* col, void* im); // ================================================================================================= #ifdef __cplusplus } // extern "C" #endif // CLBLAST_CLBLAST_NETLIB_C_H_ #endif CLBlast-1.6.3/samples/000077500000000000000000000000001463263031500144545ustar00rootroot00000000000000CLBlast-1.6.3/samples/cache.c000066400000000000000000000122741463263031500156710ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
// This project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a
// max-width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file demonstrates the CLBlast kernel cache, which stores compiled OpenCL binaries for faster
// repeated kernel execution. The cache can be pre-initialized or cleared.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================

// Standard C includes (the '#include' targets were lost in extraction; these are the headers this
// sample needs for printf, malloc/free, and clock-based timing)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings

// Includes the CLBlast library (C interface)
#include <clblast_c.h>

// Forward declaration
void run_example_routine(const cl_device_id device);

// =================================================================================================

// Example use of the CLBlast kernel cache
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Run the routine multiple times in a row: after the first time the binary is already in the
  // cache and compilation is no longer needed.
  printf("Starting caching sample with an empty cache\n");
  run_example_routine(device);
  run_example_routine(device);
  run_example_routine(device);

  // Clearing the cache makes CLBlast re-compile the kernel once
  printf("Clearing cache\n");
  CLBlastClearCache();
  run_example_routine(device);
  run_example_routine(device);

  // When the cache is empty, it can be pre-initialized with compiled kernels for all routines by
  // calling the CLBlastFillCache function, such that all other CLBlast calls can benefit from
  // pre-compiled kernels and thus execute at maximum speed.
  printf("Clearing cache\n");
  CLBlastClearCache();
  printf("Filling cache (this might take a while)\n");
  CLBlastFillCache(device);
  run_example_routine(device);

  // Clean-up
  free(platforms);
  free(devices);
  return 0;
}

// =================================================================================================

// Runs an example routine and reports the time
void run_example_routine(const cl_device_id device) {

  // Example SASUM arguments
  const size_t n = 1024*128;

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host data structures with some example data
  float* host_input = (float*)malloc(sizeof(float)*n);
  float* host_output = (float*)malloc(sizeof(float)*1);
  // (everything from here up to the final printf was lost in extraction and is reconstructed from
  // context: fill the input, copy the data to the device, run SASUM, and time it)
  for (size_t i=0; i<n; ++i) { host_input[i] = -1.5f; }
  host_output[0] = 0.0f;

  // Copy the data structures to the device
  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

  // Start the timer
  clock_t start = clock();

  // Calls an example routine: the single-precision absolute sum
  CLBlastStatusCode status = CLBlastSasum(n, device_output, 0, device_input, 0, 1, &queue, &event);

  // Wait for completion
  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Retrieves the elapsed time
  double time_ms = (clock() - start) * 1000.0 / (double)CLOCKS_PER_SEC;

  // Example completed. See "clblast_c.h" for status codes (0 -> success).
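  // Not part of the original sample, but as a sketch of how the result could be inspected: the
  // single-element output buffer holds the absolute sum, so a blocking read suffices:
  //   clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, sizeof(float), host_output, 0, NULL, NULL);
  //   printf("SASUM result: %.1f\n", host_output[0]);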
printf("Completed routine with status %d in %.3lf ms\n", status, time_ms); // Clean-up free(host_input); free(host_output); clReleaseMemObject(device_input); clReleaseMemObject(device_output); clReleaseCommandQueue(queue); clReleaseContext(context); } // ================================================================================================= CLBlast-1.6.3/samples/daxpy_cuda.cpp000066400000000000000000000060251463263031500173040ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the DAXPY routine with the C++ CUDA API of CLBlast. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include // Includes the CUDA driver API #include // Includes the CLBlast library #include // ================================================================================================= // Example use of the double-precision Xaxpy routine DAXPY int main() { // CUDA device selection const auto device_id = 0; // Example DAXPY arguments const size_t n = 8192; const double alpha = 0.7; // Initializes the OpenCL device cuInit(0); CUdevice device; cuDeviceGet(&device, device_id); // Creates the OpenCL context and stream CUcontext context; cuCtxCreate(&context, 0, device); CUstream stream; cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING); // Populate host matrices with some example data auto host_a = std::vector(n); auto host_b = std::vector(n); for (auto &item: host_a) { item = 12.193; } for (auto &item: host_b) { item = -8.199; } // Copy the matrices to the device CUdeviceptr device_a; CUdeviceptr device_b; cuMemAlloc(&device_a, host_a.size()*sizeof(double)); cuMemAlloc(&device_b, host_b.size()*sizeof(double)); cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(double), stream); cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(double), stream); // Start the timer auto start_time = std::chrono::steady_clock::now(); // Call the DAXPY routine. Note that the type of alpha (double) determines the precision. const auto status = clblast::Axpy(n, alpha, device_a, 0, 1, device_b, 0, 1, context, device); cuStreamSynchronize(stream); // Record the execution time auto elapsed_time = std::chrono::steady_clock::now() - start_time; auto time_ms = std::chrono::duration(elapsed_time).count(); // Example completed. See "clblast_cuda.h" for status codes (0 -> success). printf("Completed DAXPY in %.3lf ms with status %d\n", time_ms, static_cast(status)); // Clean-up cuMemFree(device_a); cuMemFree(device_b); cuStreamDestroy(stream); return 0; } // ================================================================================================= CLBlast-1.6.3/samples/dgemv.c000066400000000000000000000106201463263031500157210ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
// This project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a
// max-width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file demonstrates the use of the DGEMV routine. It is pure C99 and demonstrates the use of
// the C API to the CLBlast library.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================

// Standard C includes (the '#include' targets were lost in extraction)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings

// Includes the CLBlast library (C interface)
#include <clblast_c.h>

// =================================================================================================

// Example use of the double-precision routine DGEMV
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Example DGEMV arguments
  const size_t m = 128;
  const size_t n = 289;
  const double alpha = 0.7;
  const double beta = 0.0;
  const size_t a_ld = n;

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host data structures with some example data
  double* host_a = (double*)malloc(sizeof(double)*m*n);
  double* host_x = (double*)malloc(sizeof(double)*n);
  double* host_y = (double*)malloc(sizeof(double)*m);
  // (the block from here up to the status printf was lost in extraction and is reconstructed from
  // context: fill the host data with example values, copy it over, run DGEMV, wait for completion)
  for (size_t i=0; i<m*n; ++i) { host_a[i] = 12.193; }
  for (size_t i=0; i<n; ++i) { host_x[i] = 3.521; }
  for (size_t i=0; i<m; ++i) { host_y[i] = 0.0; }

  // Copy the data to the device
  cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(double), NULL, NULL);
  cl_mem device_x = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, NULL);
  cl_mem device_y = clCreateBuffer(context, CL_MEM_READ_WRITE, m*sizeof(double), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, m*n*sizeof(double), host_a, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_x, CL_TRUE, 0, n*sizeof(double), host_x, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);

  // Call the DGEMV routine.
  CLBlastStatusCode status = CLBlastDgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo,
                                          m, n, alpha,
                                          device_a, 0, a_ld,
                                          device_x, 0, 1, beta,
                                          device_y, 0, 1,
                                          &queue, &event);

  // Wait for completion
  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Example completed. See "clblast_c.h" for status codes (0 -> success).
  printf("Completed DGEMV with status %d\n", status);

  // Clean-up
  free(platforms);
  free(devices);
  free(host_a);
  free(host_x);
  free(host_y);
  clReleaseMemObject(device_a);
  clReleaseMemObject(device_x);
  clReleaseMemObject(device_y);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/samples/dtrsm.cpp000066400000000000000000000116331463263031500163150ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file demonstrates the use of the DTRSM routine. It is a stand-alone example, but it does
// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
// features, but CLBlast can also be used via the regular C-style OpenCL API.
//
// Note that this example is meant for illustration purposes only.
// CLBlast provides other programs for performance benchmarking ('client_xxxxx') and for
// correctness testing ('test_xxxxx').
//
// =================================================================================================

// Standard C++ includes (the '#include' targets were lost in extraction)
#include <cstdio>
#include <vector>

#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings

// Includes the C++ OpenCL API. If not yet available, it can be found here:
// https://raw.githubusercontent.com/KhronosGroup/OpenCL-CLHPP/main/include/CL/opencl.hpp
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_TARGET_OPENCL_VERSION 120
#include "opencl.hpp"

// Includes the CLBlast library
#include <clblast.h>

// =================================================================================================

// Example use of the double-precision Xtrsm routine DTRSM, solving A*X = alpha*B, storing the
// result in the memory of matrix B. Uses row-major storage (C-style).
int main() {

  // OpenCL platform/device settings
  const auto platform_id = 0;
  const auto device_id = 0;

  // Example TRSM arguments
  const size_t m = 4;
  const size_t n = 3;
  const double alpha = 1.0;
  const auto a_ld = m;
  const auto b_ld = n;

  // Initializes the OpenCL platform
  auto platforms = std::vector<cl::Platform>();
  cl::Platform::get(&platforms);
  if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
  auto platform = platforms[platform_id];

  // Initializes the OpenCL device
  auto devices = std::vector<cl::Device>();
  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
  if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
  auto device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  auto device_as_vector = std::vector<cl::Device>{device};
  auto context = cl::Context(device_as_vector);
  auto queue = cl::CommandQueue(context, device);
  auto event = cl_event{nullptr};

  // Populate host matrices with some example data
  auto host_a = std::vector<double>({1.0, 2.0, 1.0, -2.0,
                                     0.0, -1.0, -2.0, 0.0,
                                     0.0, 0.0, 1.0, 1.0,
                                     0.0, 0.0, 0.0, -1.0});
  auto host_b = std::vector<double>({-1.0, -1.0, 3.0,
                                     1.0, -3.0, 2.0,
                                     1.0, 1.0, -1.0,
                                     4.0, -1.0, -2.0});
  // Expected result:
  //   8 -5  2
  // -11  3  4
  //   5  0 -3
  //  -4  1  2

  // Copy the matrices to the device
  auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(double));
  auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(double));
  queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(double), host_a.data());
  queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());

  // Call the DTRSM routine. Note that the type of alpha (double) determines the precision.
  auto queue_plain = queue();
  auto status = clblast::Trsm(clblast::Layout::kRowMajor, clblast::Side::kLeft,
                              clblast::Triangle::kUpper, clblast::Transpose::kNo,
                              clblast::Diagonal::kNonUnit,
                              m, n, alpha,
                              device_a(), 0, a_ld,
                              device_b(), 0, b_ld,
                              &queue_plain, &event);

  // Retrieves the results
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  queue.enqueueReadBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());

  // Example completed. See "clblast.h" for status codes (0 -> success).
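  // Not part of the original sample: host_b now holds the solution X and can be checked against
  // the expected values listed above, e.g. the top-left element should be 8:
  //   if (host_b[0] != 8.0) { printf("Unexpected result at (0,0)\n"); }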
printf("Completed TRSM with status %d and results:\n", static_cast(status)); for (auto i = size_t{0}; i < m; ++i) { for (auto j = size_t{0}; j < n; ++j) { printf("%3.0f ", host_b[i * b_ld + j]); } printf("\n"); } return 0; } // ================================================================================================= CLBlast-1.6.3/samples/haxpy.c000066400000000000000000000104101463263031500157450ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the HAXPY routine. It demonstrates the use of half-precision. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include #define CL_TARGET_OPENCL_VERSION 120 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings // Includes the CLBlast library (C interface) #include // Includes the float-to-half and half-to-float conversion utilities #include // ================================================================================================= // Example use of the half-precision routine HAXPY int main(void) { // OpenCL platform/device settings const size_t platform_id = 0; const size_t device_id = 0; // Example HAXPY arguments const size_t n = 8192; const cl_half alpha = FloatToHalf(0.5f); // Initializes the OpenCL platform cl_uint num_platforms; clGetPlatformIDs(0, NULL, &num_platforms); cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); clGetPlatformIDs(num_platforms, platforms, NULL); cl_platform_id platform = platforms[platform_id]; // Initializes the OpenCL device cl_uint num_devices; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); cl_device_id device = devices[device_id]; // Creates the OpenCL context, queue, and an event cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); cl_event event = NULL; // Populate host vectors with some example data cl_half* host_a = (cl_half*)malloc(sizeof(cl_half)*n); cl_half* host_b = (cl_half*)malloc(sizeof(cl_half)*n); for (size_t i=0; i success). 
printf("Completed HAXPY with status %d\n", status); // Prints the first output value if (status == 0) { printf("Output value at index 0: b[0] = %.3lf\n", HalfToFloat(host_b[0])); } // Clean-up free(platforms); free(devices); free(host_a); free(host_b); clReleaseMemObject(device_a); clReleaseMemObject(device_b); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; } // ================================================================================================= CLBlast-1.6.3/samples/samax.c000066400000000000000000000102561463263031500157350ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the iSAMAX routine. It is pure C99 and demonstrates the use of // the C API to the CLBlast library. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include #define CL_TARGET_OPENCL_VERSION 120 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings // Includes the CLBlast library (C interface) #include // ================================================================================================= // Example use of the single-precision routine iSAMAX int main(void) { // OpenCL platform/device settings const size_t platform_id = 0; const size_t device_id = 0; // Example iSAMAX arguments const size_t n = 1000; // Initializes the OpenCL platform cl_uint num_platforms; clGetPlatformIDs(0, NULL, &num_platforms); cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); clGetPlatformIDs(num_platforms, platforms, NULL); cl_platform_id platform = platforms[platform_id]; // Initializes the OpenCL device cl_uint num_devices; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); cl_device_id device = devices[device_id]; // Creates the OpenCL context, queue, and an event cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); cl_event event = NULL; // Populate host data structures with some example data float* host_input = (float*)malloc(sizeof(float)*n); unsigned int* host_output = (unsigned int*)malloc(sizeof(unsigned int)*1); for (size_t i=0; i success). 
printf("Completed iSAMAX with status %d: array of %zu values with staircases from 0..9 repeated, max at index %u with value %.0lf\n", status, n, host_output[0], host_input[host_output[0]]); // Clean-up free(platforms); free(devices); free(host_input); free(host_output); clReleaseMemObject(device_input); clReleaseMemObject(device_output); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; } // ================================================================================================= CLBlast-1.6.3/samples/sasum.c000066400000000000000000000077761463263031500157710ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the SASUM routine. It is pure C99 and demonstrates the use of // the C API to the CLBlast library. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include #define CL_TARGET_OPENCL_VERSION 120 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings // Includes the CLBlast library (C interface) #include // ================================================================================================= // Example use of the single-precision routine SASUM int main(void) { // OpenCL platform/device settings const size_t platform_id = 0; const size_t device_id = 0; // Example SASUM arguments const size_t n = 1000; const float input_value = -1.5f; // Initializes the OpenCL platform cl_uint num_platforms; clGetPlatformIDs(0, NULL, &num_platforms); cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); clGetPlatformIDs(num_platforms, platforms, NULL); cl_platform_id platform = platforms[platform_id]; // Initializes the OpenCL device cl_uint num_devices; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); cl_device_id device = devices[device_id]; // Creates the OpenCL context, queue, and an event cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); cl_event event = NULL; // Populate host data structures with some example data float* host_input = (float*)malloc(sizeof(float)*n); float* host_output = (float*)malloc(sizeof(float)*1); for (size_t i=0; i success). 
printf("Completed SASUM with status %d: %zu * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]); // Clean-up free(platforms); free(devices); free(host_input); free(host_output); clReleaseMemObject(device_input); clReleaseMemObject(device_output); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; } // ================================================================================================= CLBlast-1.6.3/samples/sgemm.c000066400000000000000000000110261463263031500157300ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the SGEMM routine. It is pure C99 and demonstrates the use of // the C API to the CLBlast library. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include #define CL_TARGET_OPENCL_VERSION 120 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings // Includes the CLBlast library (C interface) #include // ================================================================================================= // Example use of the single-precision routine SGEMM int main(void) { // OpenCL platform/device settings const size_t platform_id = 0; const size_t device_id = 0; // Example SGEMM arguments const size_t m = 128; const size_t n = 64; const size_t k = 512; const float alpha = 0.7f; const float beta = 1.0f; const size_t a_ld = k; const size_t b_ld = n; const size_t c_ld = n; // Initializes the OpenCL platform cl_uint num_platforms; clGetPlatformIDs(0, NULL, &num_platforms); cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); clGetPlatformIDs(num_platforms, platforms, NULL); cl_platform_id platform = platforms[platform_id]; // Initializes the OpenCL device cl_uint num_devices; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); cl_device_id device = devices[device_id]; // Creates the OpenCL context, queue, and an event cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); cl_event event = NULL; // Populate host matrices with some example data float* host_a = (float*)malloc(sizeof(float)*m*k); float* host_b = (float*)malloc(sizeof(float)*n*k); float* host_c = (float*)malloc(sizeof(float)*m*n); for (size_t i=0; i success). 
printf("Completed SGEMM with status %d\n", status); // Clean-up free(platforms); free(devices); free(host_a); free(host_b); free(host_c); clReleaseMemObject(device_a); clReleaseMemObject(device_b); clReleaseMemObject(device_c); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; } // ================================================================================================= CLBlast-1.6.3/samples/sgemm.cpp000066400000000000000000000114431463263031500162730ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does // require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++ // features, but CLBlast can also be used using the regular C-style OpenCL API. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings // Includes the C++ OpenCL API. If not yet available, it can be found here: // https://raw.githubusercontent.com/KhronosGroup/OpenCL-CLHPP/main/include/CL/opencl.hpp #define CL_HPP_TARGET_OPENCL_VERSION 120 #define CL_HPP_MINIMUM_OPENCL_VERSION 120 #define CL_TARGET_OPENCL_VERSION 120 #include "opencl.hpp" // Includes the CLBlast library #include // ================================================================================================= // Example use of the single-precision Xgemm routine SGEMM int main() { // OpenCL platform/device settings const auto platform_id = 0; const auto device_id = 0; // Example SGEMM arguments const size_t m = 128; const size_t n = 64; const size_t k = 512; const float alpha = 0.7f; const float beta = 1.0f; const auto a_ld = k; const auto b_ld = n; const auto c_ld = n; // Initializes the OpenCL platform auto platforms = std::vector(); cl::Platform::get(&platforms); if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } auto platform = platforms[platform_id]; // Initializes the OpenCL device auto devices = std::vector(); platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); if (devices.size() == 0 || device_id >= devices.size()) { return 1; } auto device = devices[device_id]; // Creates the OpenCL context, queue, and an event auto device_as_vector = std::vector{device}; auto context = cl::Context(device_as_vector); auto queue = cl::CommandQueue(context, device); auto event = cl_event{nullptr}; // Populate host matrices with some example data auto host_a = std::vector(m*k); auto host_b = std::vector(n*k); auto host_c = std::vector(m*n); for (auto &item: host_a) { item = 12.193f; } for (auto &item: host_b) { item = -8.199f; } for (auto &item: host_c) { item = 0.0f; } // Copy the matrices to the device auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(float)); auto device_b = cl::Buffer(context, 
CL_MEM_READ_WRITE, host_b.size()*sizeof(float)); auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float)); queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data()); queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data()); queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data()); // Start the timer auto start_time = std::chrono::steady_clock::now(); // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. auto queue_plain = queue(); auto status = clblast::Gemm(clblast::Layout::kRowMajor, clblast::Transpose::kNo, clblast::Transpose::kNo, m, n, k, alpha, device_a(), 0, a_ld, device_b(), 0, b_ld, beta, device_c(), 0, c_ld, &queue_plain, &event); // Record the execution time if (status == clblast::StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } auto elapsed_time = std::chrono::steady_clock::now() - start_time; auto time_ms = std::chrono::duration(elapsed_time).count(); // Example completed. See "clblast.h" for status codes (0 -> success). printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast(status)); return 0; } // ================================================================================================= CLBlast-1.6.3/samples/sgemm_batched.cpp000066400000000000000000000127151463263031500177500ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the batched SGEMM routine. It is a stand-alone example, but it // does require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++ // features, but CLBlast can also be used using the regular C-style OpenCL API. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings // Includes the C++ OpenCL API. 
// https://raw.githubusercontent.com/KhronosGroup/OpenCL-CLHPP/main/include/CL/opencl.hpp
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_TARGET_OPENCL_VERSION 120
#include "opencl.hpp"

// Includes the CLBlast library
#include <clblast.h>

// =================================================================================================

// Example use of the single-precision batched SGEMM routine
int main() {

  // OpenCL platform/device settings
  const auto platform_id = 0;
  const auto device_id = 0;

  // Example arguments
  const size_t batch_count = 261;
  const size_t m = 1;
  const size_t n = 1;
  const size_t k = 40;
  const auto a_ld = 2560;
  const auto b_ld = 160;
  const auto c_ld = 261;
  std::vector<float> alphas(batch_count);
  std::vector<float> betas(batch_count);
  std::vector<size_t> a_offsets(batch_count);
  std::vector<size_t> b_offsets(batch_count);
  std::vector<size_t> c_offsets(batch_count);
  for (auto b_id = size_t{0}; b_id < batch_count; ++b_id) {
    alphas[b_id] = 1.0f;
    betas[b_id] = 1.0f;
    a_offsets[b_id] = 0;
    b_offsets[b_id] = 0;
    c_offsets[b_id] = b_id;
  }
  const auto a_size = a_ld * m;
  const auto b_size = b_ld * k;
  const auto c_size = c_ld * k;

  // Initializes the OpenCL platform
  auto platforms = std::vector<cl::Platform>();
  cl::Platform::get(&platforms);
  if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
  auto platform = platforms[platform_id];

  // Initializes the OpenCL device
  auto devices = std::vector<cl::Device>();
  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
  if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
  auto device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  auto device_as_vector = std::vector<cl::Device>{device};
  auto context = cl::Context(device_as_vector);
  auto queue = cl::CommandQueue(context, device);
  auto event = cl_event{nullptr};

  // Populate host matrices with some example data
  auto host_a = std::vector<float>(a_size);
  auto host_b = std::vector<float>(b_size);
  auto host_c = std::vector<float>(c_size);
  for (auto &item: host_a) { item = 12.193f; }
  for (auto &item: host_b) { item = -8.199f; }
  for (auto &item: host_c) { item = 0.0f; }

  // Copy the matrices to the device
  auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(float));
  auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(float));
  auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float));
  queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data());
  queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data());
  queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data());

  // Start the timer
  auto start_time = std::chrono::steady_clock::now();

  // Calls the routine. Note that the type of alphas and betas (float) determines the precision.
  auto queue_plain = queue();
  auto status = clblast::GemmBatched(clblast::Layout::kRowMajor,
                                     clblast::Transpose::kNo, clblast::Transpose::kNo,
                                     m, n, k,
                                     alphas.data(),
                                     device_a(), a_offsets.data(), a_ld,
                                     device_b(), b_offsets.data(), b_ld,
                                     betas.data(),
                                     device_c(), c_offsets.data(), c_ld,
                                     batch_count,
                                     &queue_plain, &event);

  // Record the execution time
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  auto elapsed_time = std::chrono::steady_clock::now() - start_time;
  auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();

  // Example completed. See "clblast.h" for status codes (0 -> success).
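  // Not part of the original sample: when the per-batch offsets advance by a fixed stride (as the
  // c_offsets above do), CLBlast also provides a strided-batched variant that replaces the offset
  // arrays by a single alpha/beta and one stride per matrix; a sketch of its shape:
  //   clblast::GemmStridedBatched(layout, a_transpose, b_transpose, m, n, k, alpha,
  //                               device_a(), 0, a_ld, a_stride,
  //                               device_b(), 0, b_ld, b_stride, beta,
  //                               device_c(), 0, c_ld, c_stride,
  //                               batch_count, &queue_plain, &event);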
printf("Completed batched SGEMM in %.3lf ms with status %d\n", time_ms, static_cast(status)); return 0; } // ================================================================================================= CLBlast-1.6.3/samples/sgemm_cuda.cpp000066400000000000000000000072501463263031500172700ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the SGEMM routine with the C++ CUDA API of CLBlast. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include // Includes the CUDA driver API #include // Includes the CLBlast library #include // ================================================================================================= // Example use of the single-precision Xgemm routine SGEMM int main() { // CUDA device selection const auto device_id = 0; // Example SGEMM arguments const size_t m = 128; const size_t n = 64; const size_t k = 512; const float alpha = 0.7f; const float beta = 1.0f; const auto a_ld = k; const auto b_ld = n; const auto c_ld = n; // Initializes the OpenCL device cuInit(0); CUdevice device; cuDeviceGet(&device, device_id); // Creates the OpenCL context and stream CUcontext context; cuCtxCreate(&context, 0, device); CUstream stream; cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING); // Populate host matrices with some example data auto host_a = std::vector(m*k); auto host_b = std::vector(n*k); auto host_c = std::vector(m*n); for (auto &item: host_a) { item = 12.193f; } for (auto &item: host_b) { item = -8.199f; } for (auto &item: host_c) { item = 0.0f; } // Copy the matrices to the device CUdeviceptr device_a; CUdeviceptr device_b; CUdeviceptr device_c; cuMemAlloc(&device_a, host_a.size()*sizeof(float)); cuMemAlloc(&device_b, host_b.size()*sizeof(float)); cuMemAlloc(&device_c, host_c.size()*sizeof(float)); cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream); cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(float), stream); cuMemcpyHtoDAsync(device_c, host_c.data(), host_c.size()*sizeof(float), stream); // Start the timer auto start_time = std::chrono::steady_clock::now(); // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. auto status = clblast::Gemm(clblast::Layout::kRowMajor, clblast::Transpose::kNo, clblast::Transpose::kNo, m, n, k, alpha, device_a, 0, a_ld, device_b, 0, b_ld, beta, device_c, 0, c_ld, context, device); cuStreamSynchronize(stream); // Record the execution time auto elapsed_time = std::chrono::steady_clock::now() - start_time; auto time_ms = std::chrono::duration(elapsed_time).count(); // Example completed. See "clblast_cuda.h" for status codes (0 -> success). 
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast(status)); // Clean-up cuMemFree(device_a); cuMemFree(device_b); cuMemFree(device_c); cuStreamDestroy(stream); return 0; } // ================================================================================================= CLBlast-1.6.3/samples/sgemm_netlib.c000066400000000000000000000044301463263031500172660ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not // recommended if you want full control over performance: it will internally copy buffers from and // to the OpenCL device. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). // // ================================================================================================= #include #include #include // Includes the CLBlast library (Netlib CBLAS interface) #include // ================================================================================================= // Example use of the single-precision routine SGEMM int main(void) { // Example SGEMM arguments const int m = 128; const int n = 64; const int k = 512; const float alpha = 0.7f; const float beta = 1.0f; const int a_ld = k; const int b_ld = n; const int c_ld = n; // Populate host matrices with some example data float* host_a = (float*)malloc(sizeof(float)*m*k); float* host_b = (float*)malloc(sizeof(float)*n*k); float* host_c = (float*)malloc(sizeof(float)*m*n); for (int i=0; i // // This file demonstrates the use of the runtime tuning API. It is a stand-alone example, but it // does require the Khronos C++ OpenCL API header file (downloaded by CMake). // // ================================================================================================= #include #include #include #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings // Includes the C++ OpenCL API. 
If not yet available, it can be found here: // https://raw.githubusercontent.com/KhronosGroup/OpenCL-CLHPP/main/include/CL/opencl.hpp #define CL_HPP_TARGET_OPENCL_VERSION 120 #define CL_HPP_MINIMUM_OPENCL_VERSION 120 #define CL_TARGET_OPENCL_VERSION 120 #include "opencl.hpp" // Includes the CLBlast library #include <clblast.h> // ================================================================================================= int main() { // OpenCL platform/device settings const auto platform_id = 0; const auto device_id = 0; // Example arguments const size_t m = 128; const size_t n = 64; const auto fraction = 1.0; // between 0.0 and 1.0 // Initializes the OpenCL platform auto platforms = std::vector<cl::Platform>(); cl::Platform::get(&platforms); if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } auto platform = platforms[platform_id]; // Initializes the OpenCL device auto devices = std::vector<cl::Device>(); platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); if (devices.size() == 0 || device_id >= devices.size()) { return 1; } auto device = devices[device_id]; // Creates the OpenCL context and queue auto device_as_vector = std::vector<cl::Device>{device}; auto context = cl::Context(device_as_vector); auto queue = cl::CommandQueue(context, device); // Performs the tuning printf("Starting the tuning...\n"); std::unordered_map<std::string, size_t> parameters; auto queue_plain = queue(); auto status = clblast::TuneCopy<float>(&queue_plain, m, n, fraction, parameters); // Tuning completed. See "clblast.h" for status codes (0 -> success). printf("Completed TuneCopy with status %d (0 == OK), found parameters:\n", static_cast<int>(status)); for (const auto &parameter: parameters) { printf("> %s = %zu\n", parameter.first.c_str(), parameter.second); } // Set the new parameters status = clblast::OverrideParameters(device(), "Copy", clblast::Precision::kSingle, parameters); printf("Completed OverrideParameters with status %d (0 == OK)\n", static_cast<int>(status)); return 0; } // ================================================================================================= CLBlast-1.6.3/scripts/000077500000000000000000000000001463263031500144775ustar00rootroot00000000000000CLBlast-1.6.3/scripts/benchmark/000077500000000000000000000000001463263031500164315ustar00rootroot00000000000000CLBlast-1.6.3/scripts/benchmark/benchmark.py000066400000000000000000000212311463263031500207360ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line.
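# Example invocation (hypothetical platform/device IDs; flags as defined in parse_arguments() below):
#   python benchmark.py --benchmark gemm --platform 0 --device 0 --comparisons clBLAS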
# # Author(s): # Cedric Nugteren <www.cedricnugteren.nl> import argparse import json import os import sys import settings import plot import utils EXPERIMENTS = { "axpy": settings.AXPY, "axpybatched": settings.AXPYBATCHED, "gemv": settings.GEMV, "gemm": settings.GEMM, "gemm_small": settings.GEMM_SMALL, "gemmbatched": settings.GEMMBATCHED, "gemmstridedbatched": settings.GEMMSTRIDEDBATCHED, "symm": settings.SYMM, "syrk": settings.SYRK, "summary": settings.SUMMARY, } COMPARISONS = ["clBLAS", "CPU-BLAS", "cuBLAS"] COMPARISON_ARGS = ["-clblas", "-cblas", "-cublas"] COMPARISON_IDS = [2, 3, 4] def run_benchmark(name, arguments_list, precision, num_runs, platform, device, comparisons): binary = "./clblast_client_x" + name # Loops over sub-benchmarks per benchmark results = [] for arguments in arguments_list: # Sets the arguments constant_arguments = ["-warm_up", "-q", "-no_abbrv"] common_arguments = ["-precision %d" % precision, "-runs %d" % num_runs] opencl_arguments = ["-platform %d" % platform, "-device %d" % device] comparison_arguments = [] for comparison_name, arg in zip(COMPARISONS, COMPARISON_ARGS): if comparison_name in comparisons: comparison_arguments.append(arg + " 1") else: comparison_arguments.append(arg + " 0") all_arguments = opencl_arguments + common_arguments + constant_arguments + comparison_arguments for argument_name, value in arguments.items(): all_arguments.append("-" + argument_name + " " + str(value)) # Calls the binary and parses the results benchmark_output = utils.run_binary(binary, all_arguments) result = utils.parse_results(benchmark_output) # For half-precision: also runs single-precision for comparison if precision == 16: all_arguments = [arg if arg != "-precision 16" else "-precision 32" for arg in all_arguments] benchmark_output = utils.run_binary(binary, all_arguments) result_extra = utils.parse_results(benchmark_output) for index in range(min(len(result), len(result_extra))): result[index]["GBs_1_FP32"] = result_extra[index]["GBs_1"] result[index]["GFLOPS_1_FP32"] = result_extra[index]["GFLOPS_1"] for id in COMPARISON_IDS: if "GBs_%d" % id in result_extra[index].keys(): result[index]["GBs_%d" % id] = result_extra[index]["GBs_%d" % id] result[index]["GFLOPS_%d" % id] = result_extra[index]["GFLOPS_%d" % id] results.extend(result) return results def parse_arguments(argv): parser = argparse.ArgumentParser(description="Runs a full benchmark for a specific routine on a specific device") parser.add_argument("-b", "--benchmark", required=True, help="The benchmark to perform (choose from %s)" % sorted(EXPERIMENTS.keys())) parser.add_argument("-c", "--comparisons", default=[], nargs='+', help="The library(s) to compare against (choose from %s)" % COMPARISONS) parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on") parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on") parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging") parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464)") parser.add_argument("-l", "--load_from_disk", action="store_true", help="Loads existing results from a JSON file and replots") parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name") parser.add_argument("-z", "--tight_plot", action="store_true", help="Enables tight plot layout for use in a paper or presentation") parser.add_argument("-o",
"--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)") parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") cl_args = parser.parse_args(argv) return vars(cl_args) def benchmark_single(benchmark, comparisons, platform, device, num_runs, precision, load_from_disk, plot_title, tight_plot, output_folder, verbose): # Sanity check if not os.path.isdir(output_folder): print("[benchmark] Error: folder '%s' doesn't exist" % output_folder) return # The benchmark name and plot title benchmark_name = utils.precision_to_letter(precision) + benchmark.upper() if benchmark.upper() != "SUMMARY": plot_title = benchmark_name if plot_title == "" else benchmark_name + ": " + plot_title # Retrieves the comparison settings library_ids = [1] for comparison in comparisons: if comparison not in COMPARISONS: print("[benchmark] Invalid comparison library '%s', choose from %s" % (comparison, COMPARISONS)) return library_ids.append(COMPARISON_IDS[COMPARISONS.index(comparison)]) # Retrieves the benchmark settings if benchmark not in EXPERIMENTS.keys(): print("[benchmark] Invalid benchmark '%s', choose from %s" % (benchmark, EXPERIMENTS.keys())) return experiment = EXPERIMENTS[benchmark] benchmarks = experiment["benchmarks"] # Either run the benchmarks for this experiment or load old results from disk json_file_name = os.path.join(output_folder, benchmark_name.lower() + "_benchmarks.json") if load_from_disk and os.path.isfile(json_file_name): print("[benchmark] Loading previous benchmark results from '" + json_file_name + "'") with open(json_file_name) as f: results = json.load(f) else: # Runs all the individual benchmarks print("[benchmark] Running on platform %d, device %d" % (platform, device)) print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), benchmark)) results = {"label_names": ["CLBlast"] + comparisons, "num_rows": experiment["num_rows"], "num_cols": experiment["num_cols"], "benchmarks": []} for bench in benchmarks: num_runs_benchmark = bench["num_runs"] if num_runs is None else num_runs print("[benchmark] Running benchmark '%s:%s'" % (bench["name"], bench["title"])) result = run_benchmark(bench["name"], bench["arguments"], precision, num_runs_benchmark, platform, device, comparisons) results["benchmarks"].append(result) # Stores the results to disk print("[benchmark] Saving benchmark results to '" + json_file_name + "'") with open(json_file_name, "w") as f: json.dump(results, f, sort_keys=True, indent=4) # Retrieves the data from the benchmark settings file_name_suffix = "_tight" if tight_plot else "" pdf_file_name = os.path.join(output_folder, benchmark_name.lower() + "_plot" + file_name_suffix + ".pdf") titles = [b["title"] if "BATCHED" in b["name"].upper() else utils.precision_to_letter(precision) + b["name"].upper() + " " + b["title"] for b in benchmarks] x_keys = [b["x_keys"] for b in benchmarks] y_keys = [["%s_%d" % (b["y_key"], i) for i in library_ids] for b in benchmarks] x_labels = [b["x_label"] for b in benchmarks] y_labels = [b["y_label"] for b in benchmarks] label_names = results["label_names"] # For half-precision: also adds single-precision results for comparison if precision == 16: label_names[0] += " FP16" for index in range(1, len(label_names)): label_names[index] += " FP32" label_names.append("CLBlast FP32") y_keys = [y_key + [y_key[0] + "_FP32"] for y_key in y_keys] # For batched routines: comparison is non-batched if benchmark in ["axpybatched", "gemmbatched", 
"gemmstridedbatched"]: for index in range(1, len(label_names)): label_names[index] += " (non-batched)" # Plots the graphs plot.plot_graphs(results["benchmarks"], pdf_file_name, results["num_rows"], results["num_cols"], x_keys, y_keys, titles, x_labels, y_labels, label_names, plot_title, tight_plot, verbose) print("[benchmark] All done") if __name__ == '__main__': parsed_arguments = parse_arguments(sys.argv[1:]) benchmark_single(**parsed_arguments) CLBlast-1.6.3/scripts/benchmark/benchmark_all.py000066400000000000000000000043571463263031500215760ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren import argparse import os import sys from benchmark import benchmark_single, COMPARISONS BENCHMARKS = ["axpy", "gemv", "gemm", "summary", "axpybatched", "gemmbatched", "gemmstridedbatched"] def parse_arguments(argv): parser = argparse.ArgumentParser(description="Runs all (main) benchmarks in one go for a given device") parser.add_argument("-c", "--comparisons", default=[], nargs='+', help="The library(s) to compare against (choose from %s)" % COMPARISONS) parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on") parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on") parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464") parser.add_argument("-l", "--load_from_disk", action="store_true", help="Increase verbosity of the script") parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name") parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)") parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") cl_args = parser.parse_args(argv) return vars(cl_args) def benchmark_all(comparisons, platform, device, precision, load_from_disk, plot_title, output_folder, verbose): for bench in BENCHMARKS: from_disk = load_from_disk for tight_plot in [True, False]: # two plots for a single benchmark benchmark_single(bench, comparisons, platform, device, None, precision, from_disk, plot_title, tight_plot, output_folder, verbose) from_disk = True # for the next plot of the same data if __name__ == '__main__': parsed_arguments = parse_arguments(sys.argv[1:]) benchmark_all(**parsed_arguments) CLBlast-1.6.3/scripts/benchmark/plot.py000066400000000000000000000132071463263031500177640ustar00rootroot00000000000000# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. 
# # Author(s): # Cedric Nugteren import utils import matplotlib matplotlib.use('Agg') from matplotlib import rcParams import matplotlib.pyplot as plt import numpy as np # Colors BLUEISH = [c / 255.0 for c in [71, 101, 177]] # #4765b1 REDISH = [c / 255.0 for c in [214, 117, 104]] # #d67568 PURPLISH = [c / 255.0 for c in [85, 0, 119]] # #550077 GREEN = [c / 255.0 for c in [144, 224, 98]] # #90e062 COLORS = [BLUEISH, REDISH, PURPLISH, GREEN] MARKERS = ["o-", "x-", ".-", "--"] def plot_graphs(results, file_name, num_rows, num_cols, x_keys, y_keys, titles, x_labels, y_labels, label_names, title, tight_plot, verbose): assert len(results) == num_rows * num_cols assert len(results) >= 1 assert len(x_keys) == len(results) assert len(y_keys) == len(results) assert len(titles) == len(results) assert len(x_labels) == len(results) assert len(y_labels) == len(results) # Tight plot (for in a paper or presentation) or regular (for display on a screen) if tight_plot: plot_size = 5 w_space = 0.20 h_space = 0.39 title_from_top = 0.11 legend_from_top = 0.17 legend_from_top_per_item = 0.04 x_label_from_bottom = 0.09 legend_spacing = 0.0 font_size = 15 font_size_legend = 13 font_size_title = font_size bounding_box = "tight" else: plot_size = 8 w_space = 0.15 h_space = 0.22 title_from_top = 0.09 legend_from_top = 0.10 legend_from_top_per_item = 0.07 x_label_from_bottom = 0.06 legend_spacing = 0.8 font_size = 15 font_size_legend = font_size font_size_title = 18 bounding_box = None # means not 'tight' # Initializes the plot size_x = plot_size * num_cols size_y = plot_size * num_rows rcParams.update({'font.size': font_size}) fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(size_x, size_y), facecolor='w', edgecolor='k') if len(results) == 1 and not type(axes) is np.ndarray: axes = np.full((1,1), axes) assert type(axes) is np.ndarray fig.text(.5, 0.92, title, horizontalalignment="center", fontsize=font_size_title) plt.subplots_adjust(wspace=w_space, hspace=h_space) # Loops over each subplot for row in range(num_rows): for col in range(num_cols): index = row * num_cols + col result = results[index] if num_rows == 1: ax = axes[col] elif num_cols == 1: ax = axes[row] else: ax = axes[row, col] plt.sca(ax) print("[plot] Plotting subplot %d" % index) # Sets the x-axis labels x_list = [[r[x_key] for r in result] for x_key in x_keys[index]] x_ticks = [",".join([utils.float_to_kilo_mega(v) for v in values]) for values in zip(*x_list)] x_location = range(len(x_ticks)) # Optional sparsifying of the labels on the x-axis if tight_plot and len(x_location) > 10: x_ticks = [v if not (i % 2) else "" for i, v in enumerate(x_ticks)] # Sets the y-data y_list = [[r[y_key] if y_key in r.keys() and not isinstance(r[y_key], str) else 0 for r in result] for y_key in y_keys[index]] y_max = [max(y) if len(y) else 1 for y in y_list] y_max = max(y_max) if len(y_list) > 0 else 1 # Sets the axes y_rounding = 10 if y_max < 80 else 50 if y_max < 400 else 200 y_axis_limit = (y_max * 1.2) - ((y_max * 1.2) % y_rounding) + y_rounding plt.ylim(ymin=0, ymax=y_axis_limit) plt.xticks(x_location, x_ticks, rotation='vertical') # Sets the labels ax.set_title(titles[index], y=1.0 - title_from_top, fontsize=font_size) if col == 0 or y_labels[index] != y_labels[index - 1]: ax.set_ylabel(y_labels[index]) ax.set_xlabel(x_labels[index]) ax.xaxis.set_label_coords(0.5, x_label_from_bottom) # Plots the graph assert len(COLORS) >= len(y_keys[index]) assert len(MARKERS) >= len(y_keys[index]) assert len(label_names) == len(y_keys[index]) for i in 
range(len(y_keys[index])): color = COLORS[i] marker = MARKERS[i] if label_names[i] in ["CLBlast", "CLBlast FP32"]: color = BLUEISH marker = "o-" elif label_names[i] in ["CLBlast FP16"]: color = PURPLISH marker = ".-" elif label_names[i] in ["clBLAS", "clBLAS FP32", "clBLAS (non-batched)"]: color = REDISH marker = "x-" elif label_names[i] in ["cuBLAS", "cuBLAS (non-batched)"]: color = GREEN marker = ".-" ax.plot(x_location, y_list[i], marker, label=label_names[i], color=color) # Sets the legend leg = ax.legend(loc=(0.02, 1.0 - legend_from_top - legend_from_top_per_item * len(y_keys[index])), handletextpad=0.1, labelspacing=legend_spacing, fontsize=font_size_legend) leg.draw_frame(False) # Saves the plot to disk print("[benchmark] Saving plot to '" + file_name + "'") fig.savefig(file_name, bbox_inches=bounding_box) CLBlast-1.6.3/scripts/benchmark/settings.py000066400000000000000000000423171463263031500206520ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren import utils AXPY = { "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "axpy", "num_runs": 40, "title": "multiples of 256K", "x_label": "sizes (n)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.k(256), "incx": 1, "incy": 1, "step": utils.k(256), "num_steps": 16}], }, { "name": "axpy", "num_runs": 40, "title": "multiples of 256K+1", "x_label": "sizes (n)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.k(256) + 1, "incx": 1, "incy": 1, "step": utils.k(256) + 1, "num_steps": 16}], }, { "name": "axpy", "num_runs": 40, "title": "around 1M", "x_label": "sizes (n)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.m(1), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}], }, { "name": "axpy", "num_runs": 20, "title": "around 16M", "x_label": "sizes (n)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.m(16), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}], }, { "name": "axpy", "num_runs": 20, "title": "strides n=8M", "x_label": "increments for x,y", "x_keys": ["incx", "incy"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.m(8), "incx": inc_x, "incy": inc_y, "step": 0, "num_steps": 1} for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]], }, { "name": "axpy", "num_runs": 40, "title": "powers of 2", "x_label": "sizes (n)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} for n in utils.powers_of_2(utils.k(32), utils.m(64))], } ] } AXPYBATCHED = { "num_rows": 1, "num_cols": 3, "benchmarks": [ { "name": "axpybatched", "num_runs": 10, "title": "num AXPYs = 8", "x_label": "sizes (n)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"batch_num": 8, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} for n in utils.powers_of_2(utils.k(8), utils.m(4))], }, { "name": "axpybatched", "num_runs": 5, "title": "num AXPYs = 64", "x_label": "sizes (n)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"batch_num": 64, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} for n in utils.powers_of_2(utils.k(8), utils.m(4))], }, { "name": 
"axpybatched", "num_runs": 10, "title": "n=512K", "x_label": "num AXPYs", "x_keys": ["batch_num"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"batch_num": b, "n": utils.k(512), "incx": 1, "incy": 1, "step": 1, "num_steps": 1} for b in utils.powers_of_2(1, 256)], } ] } GEMV = { "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "gemv", "num_runs": 40, "title": "multiples of 256", "x_label": "sizes (n=m)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 102, "step": 256, "num_steps": 20}], }, { "name": "gemv", "num_runs": 40, "title": "multiples of 257", "x_label": "sizes (n=m)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 102, "step": 257, "num_steps": 20}], }, { "name": "gemv", "num_runs": 20, "title": "around 4K", "x_label": "sizes (n=m)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 4096, "m": 4096, "incx": 1, "incy": 1, "layout": 102, "step": 1, "num_steps": 16}], }, { "name": "gemv", "num_runs": 40, "title": "multiples of 256 rotated", "x_label": "sizes (n=m)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 101, "step": 256, "num_steps": 20}], }, { "name": "gemv", "num_runs": 40, "title": "multiples of 257 rotated", "x_label": "sizes (n=m)", "x_keys": ["n"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 101, "step": 257, "num_steps": 20}], }, { "name": "gemv", "num_runs": 20, "title": "strides n=m=4K", "x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"], "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 4096, "m": 4096, "incx": inc_x, "incy": inc_y, "layout": 102, "step": 0, "num_steps": 1} for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]], } ] } GEMM = { "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "gemm", "num_runs": 20, "title": "multiples of 128", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111, "step": 128, "num_steps": 20}], }, { "name": "gemm", "num_runs": 20, "title": "multiples of 129", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 129, "n": 129, "k": 129, "layout": 102, "transA": 111, "transB": 111, "step": 129, "num_steps": 20}], }, { "name": "gemm", "num_runs": 20, "title": "around 512", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 512, "n": 512, "k": 512, "layout": 102, "transA": 111, "transB": 111, "step": 1, "num_steps": 16}], }, { "name": "gemm", "num_runs": 10, "title": "around 2048", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 2048, "n": 2048, "k": 2048, "layout": 102, "transA": 111, "transB": 111, "step": 1, "num_steps": 16}], }, { "name": "gemm", "num_runs": 10, "title": "layouts/transpose", "x_label": "layout, transA, transB", "x_keys": ["layout", "transA", "transB"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 1024, "n": 1024, "k": 1024, "layout": layout, "transA": transA, "transB": 
transB, "step": 0, "num_steps": 1} for layout in [101, 102] for transA in [111, 112] for transB in [111, 112]], }, { "name": "gemm", "num_runs": 10, "title": "powers of 2", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": n, "n": n, "k": n, "layout": 102, "transA": 111, "transB": 111, "step": 0, "num_steps": 1} for n in utils.powers_of_2(8, utils.k(4))], } ] } GEMM_SMALL = { "num_rows": 2, "num_cols": 1, "benchmarks": [ { "name": "gemm", "num_runs": 10, "title": "small matrices in steps of 16", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111, "step": 16, "num_steps": 57}], }, { "name": "gemm", "num_runs": 10, "title": "small matrices in steps of 1", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111, "step": 1, "num_steps": 385}], }, ] } GEMMBATCHED = { "num_rows": 1, "num_cols": 3, "benchmarks": [ { "name": "gemmbatched", "num_runs": 20, "title": "num GEMMs = 8", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": 8, "m": 32, "n": 32, "k": 32, "layout": 102, "transA": 111, "transB": 111, "step": 32, "num_steps": 20}], }, { "name": "gemmbatched", "num_runs": 10, "title": "num GEMMs = 64", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": 64, "m": 32, "n": 32, "k": 32, "layout": 102, "transA": 111, "transB": 111, "step": 32, "num_steps": 20}], }, { "name": "gemmbatched", "num_runs": 10, "title": "m=n=k=128", "x_label": "num GEMMs", "x_keys": ["batch_num"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": b, "m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111} for b in utils.powers_of_2(1, utils.k(4))], } ] } GEMMSTRIDEDBATCHED = { "num_rows": 1, "num_cols": 3, "benchmarks": [ { "name": "gemmstridedbatched", "num_runs": 20, "title": "num GEMMs = 8", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": 8, "m": 32, "n": 32, "k": 32, "layout": 102, "transA": 111, "transB": 111, "step": 32, "num_steps": 20}], }, { "name": "gemmstridedbatched", "num_runs": 10, "title": "num GEMMs = 64", "x_label": "sizes (m=n=k)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": 64, "m": 32, "n": 32, "k": 32, "layout": 102, "transA": 111, "transB": 111, "step": 32, "num_steps": 20}], }, { "name": "gemmstridedbatched", "num_runs": 10, "title": "m=n=k=128", "x_label": "num GEMMs", "x_keys": ["batch_num"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": b, "m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111} for b in utils.powers_of_2(1, utils.k(4))], } ] } SYMM = { "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "symm", "num_runs": 10, "title": "multiples of 128", "x_label": "sizes (m=n)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 128, "n": 128, "layout": 102, "side": 141, "triangle": 121, "step": 128, "num_steps": 20}], }, { "name": "symm", "num_runs": 10, "title": "multiples of 129", 
"x_label": "sizes (m=n)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 129, "n": 129, "layout": 102, "side": 141, "triangle": 121, "step": 129, "num_steps": 20}], }, { "name": "symm", "num_runs": 10, "title": "around 512", "x_label": "sizes (m=n)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 512, "n": 512, "layout": 102, "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], }, { "name": "symm", "num_runs": 10, "title": "around 2048", "x_label": "sizes (m=n)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 2048, "n": 2048, "layout": 102, "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], }, { "name": "symm", "num_runs": 10, "title": "layouts/sides/triangles", "x_label": "layout, side, triangle", "x_keys": ["layout", "side", "triangle"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 1024, "n": 1024, "layout": layout, "side": side, "triangle": triangle, "step": 0, "num_steps": 1} for layout in [101, 102] for side in [141, 142] for triangle in [121, 122]], }, { "name": "symm", "num_runs": 10, "title": "powers of 2", "x_label": "sizes (m=n)", "x_keys": ["m"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": n, "n": n, "layout": 102, "side": 141, "triangle": 121, "step": 0, "num_steps": 1} for n in utils.powers_of_2(8, utils.k(4))], } ] } SYRK = { "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "syrk", "num_runs": 10, "title": "multiples of 128", "x_label": "sizes (n=k)", "x_keys": ["n"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 128, "k": 128, "layout": 102, "side": 141, "triangle": 121, "step": 128, "num_steps": 20}], }, { "name": "syrk", "num_runs": 10, "title": "multiples of 129", "x_label": "sizes (n=k)", "x_keys": ["n"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 129, "k": 129, "layout": 102, "side": 141, "triangle": 121, "step": 129, "num_steps": 20}], }, { "name": "syrk", "num_runs": 10, "title": "around 512", "x_label": "sizes (n=k)", "x_keys": ["n"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 512, "k": 512, "layout": 102, "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], }, { "name": "syrk", "num_runs": 10, "title": "around 2048", "x_label": "sizes (n=k)", "x_keys": ["n"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 2048, "k": 2048, "layout": 102, "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], }, { "name": "syrk", "num_runs": 10, "title": "layouts/sides/triangles", "x_label": "layout, triangle, transA", "x_keys": ["layout", "triangle", "transA"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 1024, "k": 1024, "layout": layout, "triangle": triangle, "transA": transA, "step": 0, "num_steps": 1} for layout in [101, 102] for triangle in [121, 122] for transA in [111, 112]], }, { "name": "syrk", "num_runs": 10, "title": "powers of 2", "x_label": "sizes (n=k)", "x_keys": ["n"], "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": n, "k": n, "layout": 102, "side": 141, "triangle": 121, "step": 0, "num_steps": 1} for n in utils.powers_of_2(8, utils.k(4))], } ] } SUMMARY = { "num_rows": 3, "num_cols": 2, "benchmarks": [ AXPY["benchmarks"][0], AXPY["benchmarks"][1], GEMV["benchmarks"][0], GEMV["benchmarks"][1], GEMM["benchmarks"][0], 
GEMM["benchmarks"][1], ] } CLBlast-1.6.3/scripts/benchmark/utils.py000066400000000000000000000035301463263031500201440ustar00rootroot00000000000000# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren import csv import subprocess def k(value): return value * 1024 def m(value): return value * 1024 * 1024 def float_to_kilo_mega(value): if value % 1024 or value <= 1024: return "%.0f" % value elif value % (1024 * 1024) or value <= (1024 * 1024): return "%.0fK" % (value / 1024.0) else: return "%.0fM" % (value / (1024.0 * 1024.0)) def powers_of_2(start, stop): while start <= stop: yield start start *= 2 def precision_to_letter(precision): if precision == 16: return "H" elif precision == 32: return "S" elif precision == 64: return "D" elif precision == 3232: return "C" elif precision == 6464: return "Z" else: return "X" def run_binary(command, arguments): full_command = command + " " + " ".join(arguments) print("[benchmark] Calling binary: %s" % str(full_command)) try: result = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE).stdout.read() return result.decode("ascii") except OSError as e: print("[benchmark] Error while running the binary, got exception: %s" + str(e)) return False def parse_results(csv_data): csv_data = csv_data.split("\n") results = csv.DictReader(csv_data, delimiter=";", skipinitialspace=True) results = [r for r in results] for result in results: for key in result: if "i" in result[key]: continue else: result[key] = float(result[key]) if "." in result[key] else int(result[key]) return results CLBlast-1.6.3/scripts/database/000077500000000000000000000000001463263031500162435ustar00rootroot00000000000000CLBlast-1.6.3/scripts/database/database.py000077500000000000000000000210501463263031500203620ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. 
# # Author(s): # Cedric Nugteren import sys import os.path import glob import argparse import database.io as io import database.db as db import database.clblast as clblast import database.bests as bests import database.defaults as defaults # Server storing a copy of the database DATABASE_SERVER_URL = "https://raw.githubusercontent.com/CNugteren/CLBlast-database/master/database.json" def remove_mismatched_arguments(database): """Checks for tuning results with mis-matched entries and removes them according to user preferences""" kernel_attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] # For Python 2 and 3 compatibility try: user_input = raw_input except NameError: user_input = input pass # Check for mis-matched entries for kernel_group_name, kernel_group in db.group_by(database["sections"], kernel_attributes): group_by_arguments = db.group_by(kernel_group, clblast.ARGUMENT_ATTRIBUTES) if len(group_by_arguments) != 1: print("[database] WARNING: entries for a single kernel with multiple argument values " + str(kernel_group_name)) print("[database] Either quit or remove all but one of the argument combinations below:") for index, (attribute_group_name, mismatching_entries) in enumerate(group_by_arguments): print("[database] %d: %s" % (index, attribute_group_name)) for attribute_group_name, mismatching_entries in group_by_arguments: response = user_input("[database] Remove entries corresponding to %s, [y/n]? " % str(attribute_group_name)) if response == "y": for entry in mismatching_entries: database["sections"].remove(entry) print("[database] Removed %d entry/entries" % len(mismatching_entries)) # Sanity-check: all mis-matched entries should be removed for kernel_group_name, kernel_group in db.group_by(database["sections"], kernel_attributes): group_by_arguments = db.group_by(kernel_group, clblast.ARGUMENT_ATTRIBUTES) if len(group_by_arguments) != 1: print("[database] ERROR: entries for a single kernel with multiple argument values " + str(kernel_group_name)) assert len(group_by_arguments) == 1 def remove_database_entries(database, remove_if_matches_fields): assert len(remove_if_matches_fields.keys()) > 0 def remove_this_entry(section): for key in remove_if_matches_fields.keys(): if section[key] != remove_if_matches_fields[key]: return False return True old_length = len(database["sections"]) database["sections"] = [x for x in database["sections"] if not remove_this_entry(x)] new_length = len(database["sections"]) print("[database] Removed %d entries from the database" % (old_length - new_length)) def add_tuning_parameter(database, parameter_name, kernel, value): num_changes = 0 for section in database["sections"]: if section["kernel"] == kernel: for result in section["results"]: if parameter_name not in result["parameters"]: result["parameters"][parameter_name] = value section["parameter_names"].append(parameter_name) num_changes += 1 print("[database] Made %d addition(s) of %s" % (num_changes, parameter_name)) def main(argv): # Parses the command-line arguments parser = argparse.ArgumentParser() parser.add_argument("source_folder", help="The folder with JSON files to parse to add to the database") parser.add_argument("clblast_root", help="Root of the CLBlast sources") parser.add_argument("-r", "--remove_device", type=str, default=None, help="Removes all entries for a specific device") parser.add_argument("--add_tuning_parameter", type=str, default=None, help="Adds this parameter to existing entries") parser.add_argument("--add_tuning_parameter_for_kernel", 
type=str, default=None, help="Adds the above parameter for this kernel") parser.add_argument("--add_tuning_parameter_value", type=int, default=0, help="Set this value as the default for the above parameter") parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") cl_args = parser.parse_args(argv) # Parses the path arguments database_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database.json") database_best_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database_best.json") json_files = os.path.join(cl_args.source_folder, "*.json") cpp_database_path = os.path.join(cl_args.clblast_root, "src", "database", "kernels") # Checks whether the command-line arguments are valid clblast_header = os.path.join(cl_args.clblast_root, "include", "clblast.h") # Not used but just for validation if not os.path.isfile(clblast_header): raise RuntimeError("The path '" + cl_args.clblast_root + "' does not point to the root of the CLBlast library") if len(glob.glob(json_files)) < 1: print("[database] The path '" + cl_args.source_folder + "' does not contain any JSON files") # Downloads the database if a local copy is not present if not os.path.isfile(database_filename): io.download_database(database_filename, DATABASE_SERVER_URL) # Loads the database from disk database = io.load_database(database_filename) # Loops over all JSON files in the supplied folder for file_json in glob.glob(json_files): sys.stdout.write("[database] Processing '" + file_json + "' ") # No newline printed try: # Loads the newly imported data imported_data = io.load_tuning_results(file_json) # Adds the new data to the database old_size = db.length(database) database = db.add_section(database, imported_data) new_size = db.length(database) print("with " + str(new_size - old_size) + " new items") # Newline printed here except ValueError: print("--- WARNING: invalid file, skipping") # Checks for tuning results with mis-matched entries remove_mismatched_arguments(database) # Stores the modified database back to disk if len(glob.glob(json_files)) >= 1: io.save_database(database, database_filename) # Removes database entries before continuing if cl_args.remove_device is not None: print("[database] Removing all results for device '%s'" % cl_args.remove_device) remove_database_entries(database, {"clblast_device_name": cl_args.remove_device}) #, "kernel_family": "xgemm"}) io.save_database(database, database_filename) # Adds new tuning parameters to existing database entries if cl_args.add_tuning_parameter is not None and\ cl_args.add_tuning_parameter_for_kernel is not None: print("[database] Adding tuning parameter: '%s' for kernel '%s' with default %d" % (cl_args.add_tuning_parameter, cl_args.add_tuning_parameter_for_kernel, cl_args.add_tuning_parameter_value)) add_tuning_parameter(database, cl_args.add_tuning_parameter, cl_args.add_tuning_parameter_for_kernel, cl_args.add_tuning_parameter_value) io.save_database(database, database_filename) # Retrieves the best performing results print("[database] Calculating the best results per device/kernel...") database_best_results = bests.get_best_results(database) # Determines the defaults for other vendors and per vendor print("[database] Calculating the default values...") database_defaults = defaults.calculate_defaults(database, cl_args.verbose) database_best_results["sections"].extend(database_defaults["sections"]) # Optionally outputs the database to disk if cl_args.verbose: io.save_database(database_best_results, 
database_best_filename) # Outputs the database as a C++ database print("[database] Producing a C++ database in '" + cpp_database_path + "'...") clblast.print_cpp_database(database_best_results, cpp_database_path) print("[database] All done") if __name__ == '__main__': main(sys.argv[1:]) CLBlast-1.6.3/scripts/database/database/000077500000000000000000000000001463263031500200075ustar00rootroot00000000000000CLBlast-1.6.3/scripts/database/database/__init__.py000066400000000000000000000000001463263031500221060ustar00rootroot00000000000000CLBlast-1.6.3/scripts/database/database/bests.py000066400000000000000000000044701463263031500215060ustar00rootroot00000000000000 # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren import sys import database.clblast as clblast def get_best_results(database): """Retrieves the results with the lowest execution times""" sections_best = [] for section in database["sections"]: section_best = {} # Stores all the section's meta data for attribute in section.keys(): if attribute != "results": section_best[attribute] = section[attribute] if section_best["clblast_device_architecture"] == "" and section_best["clblast_device_vendor"] in clblast.VENDORS_WITH_ARCHITECTURE: section_best["clblast_device_architecture"] = clblast.DEVICE_ARCHITECTURE_DEFAULT # Find the best result parameters_best = None time_best = sys.float_info.max for result in section["results"]: if result["time"] < time_best: time_best = result["time"] parameters_best = result["parameters"] # Stores the best result section_best["results"] = [{"time": time_best, "parameters": parameters_best}] sections_best.append(section_best) return {"sections": sections_best} def get_relative_bests(name, common_results, common_parameters, verbose=False): """Retrieves the parameters with the relative best execution time over different devices""" # Helper function def argmin(iterable): return min(enumerate(iterable), key=lambda x: x[1])[0] # Computes the sum of the execution times over the different devices performance_sums = [] for parameters in common_parameters: performance_sum = sum([r["relative_time"] for r in common_results if r["parameters"] == parameters]) performance_sums.append(performance_sum) # Retrieves the entry with the lowest time best_index = argmin(performance_sums) best_performance = performance_sums[best_index] best_parameters = common_parameters[best_index] # Completed, report and return the results if verbose: print("[database] " + str(name) + " with performance " + str(best_performance)) return best_parameters CLBlast-1.6.3/scripts/database/database/clblast.py000066400000000000000000000314701463263031500220120ustar00rootroot00000000000000 # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. 
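# For orientation (an abridged sketch): the helpers below generate C++ database headers of the form
#   const DatabaseEntry XgemmSingle = { "Xgemm", Precision::kSingle, ... };
# one per kernel family and precision, emitted by print_cpp_database() at the bottom of this file.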
# # Author(s): # Cedric Nugteren <www.cedricnugteren.nl> import os # Type settings (also change in database_structure.hpp) STRING_LENGTH = 50 PARAMETERS_LENGTH = 16 # Constants from the C++ code VENDOR_DEFAULT = "default" DEVICE_TYPE_DEFAULT = "All" DEVICE_NAME_DEFAULT = "default" DEVICE_NAME_DEFAULT_CONSTANT = "kDeviceNameDefault " DEVICE_ARCHITECTURE_DEFAULT = "default" # List of attributes DEVICE_TYPE_ATTRIBUTES = ["clblast_device_vendor", "clblast_device_type"] DEVICE_ATTRIBUTES = ["clblast_device_name", "clblast_device_architecture", "device_core_clock", "device_compute_units"] KERNEL_ATTRIBUTES = ["precision", "kernel_family"] ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta", "arg_from", "arg_to", "arg_step", "arg_channels", "arg_height", "arg_width", "arg_kernel_h", "arg_kernel_w", "arg_num_kernels", "arg_batch_count"] ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES # Other constants VENDORS_WITH_ARCHITECTURE = ["AMD", "NVIDIA"] def precision_to_string(precision): """Translates a precision number (represented as Python string) into a descriptive string""" if precision == "16": return "Half" elif precision == "32": return "Single" elif precision == "64": return "Double" elif precision == "3232": return "ComplexSingle" elif precision == "6464": return "ComplexDouble" else: raise ValueError("Unknown precision: " + precision) def get_cpp_separator(): """Retrieves a C++ comment separator""" return "// =================================================================================================" def get_cpp_header(family, precision): """Retrieves the C++ header""" return ("\n" + get_cpp_separator() + """ // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the '%s%s' kernels.
//\n""" % (family.title(), precision)) + get_cpp_separator() + "\n" def get_cpp_header_namespace(): return "\nnamespace clblast {\n" + "namespace database {\n" def get_cpp_footer(): """Retrieves the C++ footer""" return "\n} // namespace database\n" + "} // namespace clblast\n" def get_cpp_precision(family, precision): """Retrieves the C++ code for the start of a new precision""" precision_string = precision_to_string(precision) camelcase_name = family.title().replace("_", "") return("\nconst DatabaseEntry %s%s = {\n \"%s\", Precision::k%s" % (camelcase_name, precision_string, camelcase_name, precision_string)) def get_cpp_device_vendor(vendor, device_type): """Retrieves the C++ code for the (default) vendor and device type""" if vendor == VENDOR_DEFAULT and device_type == DEVICE_TYPE_DEFAULT: return " { // Default\n kDeviceType%s, \"%s\", {\n" % (device_type, vendor) device_type_caps = device_type[0].upper() + device_type[1:] return " { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, device_type, device_type_caps, vendor) def get_cpp_family_includes(family, precisions): result = "\n" result += "#include \"database/kernels/%s/%s.hpp\"\n" % (family, family) for precision in precisions: result += "#include \"database/kernels/%s/%s_%s.hpp\"\n" % (family, family, precision) return result def get_hpp_family_includes(family, precisions): result = "\n" result += "#include \"database/database_structure.hpp\"\n" result += "\n" result += "namespace clblast {\n" result += "namespace database {\n" result += "\n" camelcase_name = family.title().replace("_", "") for precision in precisions: precision_string = precision_to_string(precision) result += "extern const DatabaseEntry %s%s;\n" % (camelcase_name, precision_string) result += "\n" result += "} // namespace database\n" result += "} // namespace clblast\n" return result def print_as_name(name): return "Name{\"%-50s\"}" % name.strip()[:STRING_LENGTH] def get_kernel_database_results(kernel_database): """Retrieves the best result from a group of results. Asserts for valid data""" assert len(kernel_database) >= 1 all_results = [item["results"] for item in kernel_database] best_results = all_results[0] for results in all_results: # Debugging in case of unexpected results length_assumption = (len(results) == 1) params_assumption = (sorted(results[0]["parameters"]) == sorted(best_results[0]["parameters"])) if not length_assumption or not params_assumption: print("[database] ERROR: Found %d kernel databases, expected 1" % len(kernel_database)) all_keys = sorted([key for item in kernel_database for key in item.keys()]) missing_keys = set([x for x in all_keys if all_keys.count(x) != len(kernel_database)]) print("[database] All keys in databases: %s" % str(set(all_keys))) print("[database] Missing keys in one or more databases: %s" % str(missing_keys)) for index, item in enumerate(kernel_database): print("[database] %d:" % index) print(item) assert length_assumption assert params_assumption if results[0]["time"] < best_results[0]["time"]: best_results = results return best_results def print_cpp_database(database, output_dir): """Outputs the database as C++ code""" # Iterates over the kernel families kernel_families = sorted(set([s["kernel_family"] for s in database["sections"]])) for family_name in kernel_families: family_database = [s for s in database["sections"] if s["kernel_family"] == family_name] # Goes into a new path for each kernel family family_path = os.path.join(output_dir, family_name) # Loops over the different precision (e.g. 
16, 32, 3232, 64, 6464)
        precisions = sorted(set([s["precision"] for s in database["sections"]]))  # Based on full database
        for precision in precisions:
            precision_database = [s for s in family_database if s["precision"] == precision]

            # Opens a new file for each precision
            full_path = os.path.join(family_path, family_name + "_" + precision + ".hpp")
            with open(full_path, 'w+') as f:
                f.write(get_cpp_header(family_name, precision))
                f.write(get_cpp_header_namespace())
                f.write(get_cpp_precision(family_name, precision))

                # In case there is nothing found at all (e.g. 16-bit): continue as if this was a
                # precision of 32 but with the defaults only
                if len(precision_database) == 0:
                    print("[database] No results found for %s:%s, retrieving defaults from %s:32" %
                          (family_name, precision, family_name))
                    precision_database = [s for s in family_database if s["precision"] == "32"
                                          and s["clblast_device_vendor"] == VENDOR_DEFAULT
                                          and s["clblast_device_type"] == DEVICE_TYPE_DEFAULT
                                          and s["clblast_device_name"] == DEVICE_NAME_DEFAULT]

                # Discovers the parameters for this kernel
                parameter_names = []
                for example_data in precision_database:
                    for example_result in example_data["results"]:
                        parameter_names.extend([str(k) for k in example_result["parameters"].keys()])
                parameter_names = sorted(list(set(parameter_names)))
                parameter_names_as_string = ", ".join(['"%s"' % p for p in parameter_names])
                f.write(", {" + parameter_names_as_string + "}, {\n")

                # Loops over device vendors (e.g. AMD)
                device_vendors = sorted(set([s["clblast_device_vendor"] for s in precision_database]))
                for vendor in device_vendors:
                    vendor_database = [s for s in precision_database if s["clblast_device_vendor"] == vendor]

                    # Loops over device types (e.g. GPU)
                    device_types = sorted(set([s["clblast_device_type"] for s in vendor_database]))
                    for device_type in device_types:
                        type_database = [s for s in vendor_database if s["clblast_device_type"] == device_type]
                        f.write(get_cpp_device_vendor(vendor, device_type))

                        # Loops over every architecture of this vendor-type combination
                        architectures = sorted(set([s["clblast_device_architecture"] for s in type_database]))
                        if vendor in VENDORS_WITH_ARCHITECTURE:
                            architectures = [a for a in architectures if a != ""]
                        for architecture in architectures:
                            architecture_database = [s for s in type_database if s["clblast_device_architecture"] == architecture]
                            architecture_string = DEVICE_ARCHITECTURE_DEFAULT if architecture == "" else architecture
                            f.write(" { \"%s\", {\n" % architecture_string)

                            # Loops over every device of this vendor-type combination
                            devices = sorted(set([s["clblast_device_name"] for s in architecture_database]))
                            for device_name in devices:
                                device_database = [s for s in architecture_database if s["clblast_device_name"] == device_name]
                                device_name_as_string = print_as_name(device_name) if device_name != DEVICE_NAME_DEFAULT else DEVICE_NAME_DEFAULT_CONSTANT
                                device_name_cpp = " { %s, Params{ " % device_name_as_string
                                f.write(device_name_cpp)

                                # Collects the parameters for this entry
                                parameters = []
                                parameter_index = 0
                                kernels = sorted(set([s["kernel"] for s in device_database]))
                                for kernel in kernels:
                                    kernel_database = [s for s in device_database if s["kernel"] == kernel]
                                    results = get_kernel_database_results(kernel_database)
                                    assert len(results) == 1
                                    new_parameters = results[0]["parameters"]
                                    for parameter_name in sorted(new_parameters):
                                        assert parameter_name == parameter_names[parameter_index]
                                        parameter_value = new_parameters[parameter_name]
                                        parameters.append(str(parameter_value))
                                        parameter_index += 1

                                # Appends zeros to complete the list
                                assert parameter_index <= PARAMETERS_LENGTH
                                for append_index in range(parameter_index, PARAMETERS_LENGTH):
                                    parameters.append("0")

                                # Prints the entry
                                f.write(", ".join(parameters))
                                f.write(" } },\n")

                            # Prints the architecture footer
                            f.write(" } },\n")

                        # Prints the vendor-type combination footer
                        f.write(" }\n },\n")

                # Prints the precision footer
                f.write(" }\n};\n")

                # Prints the file footer
                f.write(get_cpp_footer())

        # Creates the combined family sources
        full_path = os.path.join(family_path, family_name + ".cpp")
        with open(full_path, 'w+') as f:
            f.write(get_cpp_header(family_name, ""))
            f.write(get_cpp_family_includes(family_name, precisions))

        # Creates the combined family includes header
        full_path = os.path.join(family_path, family_name + ".hpp")
        with open(full_path, 'w+') as f:
            f.write(get_cpp_header(family_name, ""))
            f.write(get_hpp_family_includes(family_name, precisions))

CLBlast-1.6.3/scripts/database/database/db.py

# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
#   Cedric Nugteren

import itertools
from operator import itemgetter


def length(database):
    """Computes the total number of tuning entries"""
    num_tuning_entries = 0
    for section in database["sections"]:
        num_tuning_entries += len(section["results"])
    return num_tuning_entries


def add_section(database, new_section):
    """Adds a new section to the database"""
    for old_section in database["sections"]:

        # Verify whether the sections match
        equal = True
        for attribute in new_section.keys():
            if attribute != "results":
                if attribute not in old_section or new_section[attribute] != old_section[attribute]:
                    equal = False
                    break

        # They match: append the new section's results to the corresponding entry in the database and return
        if equal:
            old_section["results"] = combine_results(old_section["results"], new_section["results"])
            return database

    # No match found: append the whole new section to the database
    database["sections"].append(new_section)
    return database


def combine_results(old_results, new_results):
    """Adds new results to the results JSON list"""
    for new_result in new_results:
        old_results = combine_result(old_results, new_result)
    return old_results


def combine_result(old_results, new_result):
    """Adds a new result to the results JSON list; filters for duplicate entries and saves the best performing one"""

    # Loops over all existing results to test for already existing entries with these parameters
    for old_result in old_results:

        # Verify whether the results match
        equal = new_result["parameters"] == old_result["parameters"]

        # They match: keep only the one with the minimum execution time
        if equal:
            old_result["time"] = min(old_result["time"], new_result["time"])
            return old_results

    # No match found: append a new result
    old_results.append(new_result)
    return old_results


def group_by(database, attributes):
    """Returns a list with the name of the group and the corresponding entries in the database"""
    assert len(database) > 0
    attributes = [a for a in attributes if a in database[0]]
    database.sort(key=itemgetter(*attributes))
    result = []
    for key, data in itertools.groupby(database, key=itemgetter(*attributes)):
        result.append((key, list(data)))
    return result
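
# Editor's illustration (hypothetical demo, not part of the original module): shows how
# combine_results() above merges tuner entries, keeping the fastest duplicate and appending
# previously unseen parameter combinations as-is.
def _demo_combine_results():
    old = [{"parameters": {"WGS": 64}, "time": 1.2}]
    new = [{"parameters": {"WGS": 64}, "time": 0.9},    # duplicate parameters: keeps the minimum time
           {"parameters": {"WGS": 128}, "time": 1.5}]   # new parameters: appended as-is
    merged = combine_results(old, new)
    assert merged == [{"parameters": {"WGS": 64}, "time": 0.9},
                      {"parameters": {"WGS": 128}, "time": 1.5}]
    return merged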

CLBlast-1.6.3/scripts/database/database/defaults.py

# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
#   Cedric Nugteren

import ast
from collections import defaultdict

import database.bests as bests
import database.clblast as clblast


def set_identifiers(database, group_by_attributes, identifier_name):
    """Sets a group-identifier based on a given set of attributes. Modifies the database but also returns a list
    of unique identifiers."""
    identifiers = []
    for section in database["sections"]:
        identifier = []
        for attribute in group_by_attributes:
            if attribute in section:
                identifier.append(section[attribute])
        section[identifier_name] = ";".join(identifier)
        identifiers.append(section[identifier_name])
    return sorted(set(identifiers))


def remove_identifiers(database, identifier_name):
    """Removes an identifier from all sections in the database"""
    for section in database["sections"]:
        section.pop(identifier_name, None)


def get_groups_by_identifier(database, group_identifiers, identifier_name):
    """Returns a list of (group, group_identifier) tuples based on a previously made grouping"""
    groups = []
    for group_identifier in group_identifiers:

        # Get all sections in this group
        group = []
        for section in database["sections"]:
            if section[identifier_name] == group_identifier:
                group.append(section)

        groups.append((group, group_identifier))
    return groups
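
# Editor's illustration (hypothetical demo, not part of the original module): set_identifiers()
# tags every section with a joined attribute string, so that sections sharing e.g. the same
# kernel and device type end up in the same group.
def _demo_set_identifiers():
    db = {"sections": [{"kernel": "xgemm", "clblast_device_type": "GPU"},
                       {"kernel": "xgemm", "clblast_device_type": "GPU"},
                       {"kernel": "xaxpy", "clblast_device_type": "CPU"}]}
    ids = set_identifiers(db, ["kernel", "clblast_device_type"], "group_identifier")
    assert ids == ["xaxpy;CPU", "xgemm;GPU"]  # sorted and de-duplicated
    return get_groups_by_identifier(db, ids, "group_identifier")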

def add_default_sections(database, grouping, verbose, values_dict, condition, enable_warning):
    default_sections = []

    # Groups the database by a certain grouping
    group_identifiers = set_identifiers(database, grouping, "group_identifier")
    groups = get_groups_by_identifier(database, group_identifiers, "group_identifier")

    # Loops over all groups
    for group, group_identifier in groups:

        # Computes the best parameters
        default_parameters = get_common_best_parameters(group, group_identifier, verbose, enable_warning)
        assert len(group) > 0
        if condition(group[0]):

            # Stores all the section's data
            default_section = {}
            for attribute in group[0].keys():
                if attribute != "results" and attribute != "group_identifier":
                    default_section[attribute] = group[0][attribute]
            default_section["clblast_device_compute_units"] = 0
            default_section["clblast_device_core_clock"] = 0
            for key in values_dict.keys():
                default_section[key] = values_dict[key]
            default_section["results"] = [{"time": 0.0, "parameters": default_parameters}]
            default_sections.append(default_section)
    return default_sections


def calculate_defaults(database, verbose):
    """Sets defaults for devices of the same type/vendor"""
    default_sections = {"sections": []}

    # Groups the database by kernel, vendor and device architecture (e.g. AMD GPU "Fiji")
    architecture_group = clblast.GROUP_ATTRIBUTES + ["clblast_device_architecture"]
    architecture_defaults = add_default_sections(database, architecture_group, verbose,
                                                 {"clblast_device_name": clblast.DEVICE_NAME_DEFAULT},
                                                 lambda entry: True, enable_warning=False)

    # Groups the database by kernel, vendor and device type (e.g. AMD GPU)
    device_defaults = add_default_sections(database, clblast.GROUP_ATTRIBUTES, verbose,
                                           {"clblast_device_name": clblast.DEVICE_NAME_DEFAULT,
                                            "clblast_device_architecture": clblast.DEVICE_ARCHITECTURE_DEFAULT},
                                           lambda entry: entry["clblast_device_architecture"] != "",
                                           enable_warning=True)
    default_sections["sections"].extend(device_defaults)

    # Groups the database by kernel, vendor and device type (e.g. AMD GPU) - but not by arguments!
    # This is to check for mis-matched arguments in the database. Note: this is not a check on the
    # architecture defaults
    attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"]
    group_identifiers = set_identifiers(default_sections, attributes, "temp_identifier")
    groups = get_groups_by_identifier(default_sections, group_identifiers, "temp_identifier")
    for group, group_identifier in groups:
        if len(group) != 1:
            print("[ERROR] Entries for a single kernel with multiple argument values: " + str(group_identifier))
        assert len(group) == 1
    remove_identifiers(default_sections, "temp_identifier")

    # Adds the architecture defaults only after running the above check
    default_sections["sections"].extend(architecture_defaults)

    # Groups the database by kernel only
    group_identifiers = set_identifiers(database, clblast.KERNEL_ATTRIBUTES + ["kernel"], "group_identifier")
    groups = get_groups_by_identifier(database, group_identifiers, "group_identifier")

    # Loops over all groups
    for group, group_identifier in groups:

        # Computes the best parameters
        default_parameters = get_common_best_parameters(group, group_identifier, verbose, enable_warning=True)

        # Stores all the section's data
        assert len(group) > 0
        default_section = {}
        for attribute in group[0].keys():
            if attribute != "results" and attribute != "group_identifier":
                default_section[attribute] = group[0][attribute]
        default_section["clblast_device_name"] = clblast.DEVICE_NAME_DEFAULT
        default_section["clblast_device_architecture"] = clblast.DEVICE_ARCHITECTURE_DEFAULT
        default_section["clblast_device_vendor"] = clblast.VENDOR_DEFAULT
        default_section["clblast_device_type"] = clblast.DEVICE_TYPE_DEFAULT
        default_section["clblast_device_compute_units"] = 0
        default_section["clblast_device_core_clock"] = 0
        default_section["results"] = [{"time": 0.0, "parameters": default_parameters}]
        default_sections["sections"].append(default_section)

    # Database with both types of defaults only
    return default_sections


def get_smallest_best_parameters(group):
    """Sets defaults based on the smallest values of all known entries. The average might be better for performance
    but some parameters might not be supported on other devices."""

    # Counts the number of devices in this group
    assert len(group) > 0

    # Find the smallest values of the parameters
    min_parameters = {}
    for section in group:
        assert len(section["results"]) > 0
        minimum_time = min([result["time"] for result in section["results"]])
        for result in section["results"]:
            if result["time"] == minimum_time:
                for parameter in result["parameters"]:
                    if parameter in min_parameters:
                        min_parameters[parameter] = min(min_parameters[parameter], result["parameters"][parameter])
                    else:
                        min_parameters[parameter] = result["parameters"][parameter]
    return min_parameters


def get_parameter_names(section):
    return [result["parameters"] for result in section["results"]]
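
# Editor's illustration (hypothetical demo, not part of the original module): the fallback above
# picks, per device, the best-timed result and then takes the element-wise minimum of those
# parameters across devices, so the defaults stay within every device's limits.
def _demo_get_smallest_best_parameters():
    group = [{"results": [{"time": 1.0, "parameters": {"WGS": 128, "VW": 4}},
                          {"time": 2.0, "parameters": {"WGS": 256, "VW": 8}}]},   # best: WGS=128, VW=4
             {"results": [{"time": 0.5, "parameters": {"WGS": 64, "VW": 8}}]}]    # best: WGS=64, VW=8
    assert get_smallest_best_parameters(group) == {"WGS": 64, "VW": 4}
    return group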

def get_common_best_parameters(group, group_identifier, verbose, enable_warning):
    """Sets defaults based on the best values of entries supported by all devices. This might cause a problem
    in case not every device was tuned with the same parameters. In that case it falls back to the above
    method to retrieve the smallest best execution time"""

    # Counts the number of devices in this group
    num_devices = len(group)
    assert num_devices > 0

    # Inserts the relative execution times into the database
    for section in group:
        assert len(section["results"]) > 0
        minimum_time = min([result["time"] for result in section["results"]])
        for result in section["results"]:
            base_line = minimum_time if section["kernel"] != "gemm_kernel_selection" else 1.0
            result["relative_time"] = result["time"] / base_line

    # Determine which parameters are available for all devices
    common_parameters = get_parameter_names(group[0])  # Parameters of the first section
    for i in range(1, num_devices):
        section_parameters = get_parameter_names(group[i])
        common_parameters = [p for p in section_parameters if p in common_parameters]  # Intersection of the parameters

    # Fall back to another method in case there are no shared entries at all across devices
    if len(common_parameters) == 0:
        if verbose:
            print("[database] No common kernels for: " + str(group_identifier) +
                  " across all %d devices " % num_devices)

        # Computes the amount of devices with shared parameters
        parameters_count = defaultdict(int)
        for i in range(0, num_devices):
            for parameters in get_parameter_names(group[i]):
                parameters_count[str(parameters)] += 1
        num_devices_common = max(parameters_count.values())

        # Fall back method in case there are no shared entries at all across devices
        if num_devices_common == 1:
            if enable_warning:
                print("[database] Warning: No common kernels for: " + str(group_identifier) + " at all")
            smallest_best_parameters = get_smallest_best_parameters(group)
            if verbose:
                print("[database] " + str(group_identifier))
            return smallest_best_parameters

        # Checks if perhaps there are many more shared parameters with a bit fewer devices
        num_parameters_common = defaultdict(int)
        for count in parameters_count.values():
            if count != 1:
                num_parameters_common[str(count)] += 1
        if num_parameters_common[str(num_devices_common - 1)] > num_parameters_common[str(num_devices_common)]:
            num_devices_common -= 1
        if verbose:
            print("[database] Found %d common kernels for: " % num_parameters_common[str(num_devices_common)] +
                  str(group_identifier) + " across %d out of %d devices " % (num_devices_common, num_devices))

        # Populates the common parameters
        for parameters_string in parameters_count.keys():
            count = parameters_count[parameters_string]
            if count == num_devices_common:
                parameters = ast.literal_eval(parameters_string)
                common_parameters.append(parameters)

    # Removes entries with parameters which are not common
    common_results = []
    for section in group:
        for result in section["results"]:
            if result["parameters"] in common_parameters:
                common_results.append(result)

    # Retrieves the entries with the highest relative performance
    relative_best_parameters = bests.get_relative_bests(group_identifier, common_results, common_parameters, verbose)
    return relative_best_parameters
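
# Editor's note (illustration, not part of the original module): calculate_defaults() above emits
# three layers of fallback entries, matched from most to least specific:
#   1. per-architecture defaults (e.g. AMD GPU "Fiji"; the device name is wildcarded)
#   2. per vendor/type defaults (e.g. AMD GPU; the architecture is wildcarded as well)
#   3. global defaults (vendor, type, architecture and device name all wildcarded)
# Each layer is stored as a synthetic section with time 0.0 and the agreed-upon parameters.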

CLBlast-1.6.3/scripts/database/database/io.py

# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
#   Cedric Nugteren

import re
import json

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2


def download_database(filename, database_url):
    """Downloads a database and saves it to disk"""
    print("[database] Downloading database from '" + database_url + "'...")
    database = urlopen(database_url)
    with open(filename, "wb") as f:
        f.write(database.read())


def load_database(filename):
    """Loads a database from disk"""
    print("[database] Loading database from '" + filename + "'")
    with open(filename) as f:
        database = json.load(f)
    return decompress_database(database)


def save_database(database, filename):
    """Saves a database to disk"""
    compressed_db = compress_database(database)
    print("[database] Saving database to '" + filename + "'")
    with open(filename, "w") as f:
        json.dump(compressed_db, f, sort_keys=True, indent=2, separators=(',', ': '))


def compress_database(database):
    """Moves certain common fields up in the hierarchy, transforms dicts into lists"""
    new_sections = []
    for section in database["sections"]:
        new_section = {}
        for field in section:
            if field == "results":
                parameter_names = [sorted(result["parameters"].keys()) for result in section["results"]]
                assert len(list(set([" ".join(p) for p in parameter_names]))) == 1
                new_section["parameter_names"] = parameter_names[0]  # they are all the same
                new_results = [[",".join([str(result["parameters"][p]) for p in new_section["parameter_names"]]),
                                result["time"]] for result in section["results"]]
                new_section[field] = new_results
            elif field != "parameter_names":
                new_section[field] = section[field]
        new_sections.append(new_section)
    return {"sections": new_sections}


def decompress_database(database):
    """Undo the above compression"""
    for section in database["sections"]:
        new_results = []
        for result in section["results"]:
            parameters = {}
            for name, value in zip(section["parameter_names"], result[0].split(",")):
                parameters[name] = int(value)
            new_result = {
                "parameters": parameters,
                "time": result[1]
            }
            new_results.append(new_result)
        section["results"] = new_results
    return database


def load_tuning_results(filename):
    """Loads JSON data from file and pre-processes it"""
    with open(filename) as f:
        json_data = json.load(f)

    # Removes the numbering following the kernel family name
    json_data["kernel_family"] = re.sub(r'_\d+', '', json_data["kernel_family"])

    # Removes unnecessary data
    if json_data["best_kernel"]:
        del json_data["best_kernel"]
    if json_data["best_time"]:
        del json_data["best_time"]
    if json_data["best_parameters"]:
        del json_data["best_parameters"]

    # Adds the kernel name to the section instead of to the individual results
    assert len(json_data["results"]) > 0
    json_data["kernel"] = json_data["results"][0]["kernel"]
    for result in json_data["results"]:
        assert json_data["kernel"] == result["kernel"]
        result.pop("kernel", None)

    # Removes the 'PRECISION' parameter from the individual results: it is redundant
    for result in json_data["results"]:
        assert json_data["precision"] == str(result["parameters"]["PRECISION"])
        result["parameters"].pop("PRECISION", None)

    # Fixes the scalar argument values
    for value, replacement in zip(["2.00", "2.00+0.50i"], ["2.000000", "2+0.5i"]):
        for field in ["arg_alpha", "arg_beta"]:
            if field in json_data.keys() and json_data[field] == value:
                json_data[field] = replacement

    # All done
    return json_data
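
# Editor's illustration (hypothetical demo, not part of the original module): the on-disk format
# produced by compress_database() hoists the parameter names out of each result and flattens the
# values into a comma-separated string; decompress_database() restores the dict-based layout.
def _demo_compression_round_trip():
    database = {"sections": [{"kernel": "xaxpy",
                              "results": [{"parameters": {"VW": 2, "WGS": 64}, "time": 1.5}]}]}
    compressed = compress_database(database)
    assert compressed["sections"][0]["parameter_names"] == ["VW", "WGS"]
    assert compressed["sections"][0]["results"] == [["2,64", 1.5]]
    return decompress_database(compressed)  # back to the original dict-based layout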

CLBlast-1.6.3/scripts/generator/
CLBlast-1.6.3/scripts/generator/generator.py

#!/usr/bin/env python

# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
#   Cedric Nugteren
#
# This script automatically generates the bodies of the following files, creating the full CLBlast API interface and
# implementation (C, C++, and reference BLAS wrappers):
#    clblast.h
#    clblast.cpp
#    clblast_c.h
#    clblast_c.cpp
#    clblast_cuda.h
#    clblast_cuda.cpp
#    clblast_netlib_c.h
#    clblast_netlib_c.cpp
#    wrapper_clblas.h
#    wrapper_cblas.h
#    pyclblast.pyx
# It also generates the main functions for the correctness and performance tests as found in
#    test/correctness/routines/levelX/xYYYY.cpp
#    test/performance/routines/levelX/xYYYY.cpp
# It also produces the API documentation found in doc/clblast.md

import sys
import os.path
import argparse

import generator.cpp as cpp
import generator.doc as doc
import generator.pyclblast as pyclblast
from generator.routine import Routine
from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU

FILES = [
    "/include/clblast.h",
    "/src/clblast.cpp",
    "/include/clblast_c.h",
    "/src/clblast_c.cpp",
    "/test/wrapper_clblas.hpp",
    "/test/wrapper_cblas.hpp",
    "/test/wrapper_cublas.hpp",
    "/include/clblast_netlib_c.h",
    "/src/clblast_netlib_c.cpp",
    "/include/clblast_cuda.h",
    "/src/clblast_cuda.cpp",
    "/src/pyclblast/src/pyclblast.pyx",
]
HEADER_LINES = [129, 21, 133, 24, 29, 45, 29, 66, 40, 96, 21, 341]
FOOTER_LINES = [98, 57, 112, 275, 6, 6, 6, 9, 2, 41, 56, 37]
HEADER_LINES_DOC = 0
FOOTER_LINES_DOC = 232

# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
ald_n = "The value of `a_ld` must be at least `n`."
ald_k_one = "The value of `a_ld` must be at least `k + 1`."
ald_kl_ku_one = "The value of `a_ld` must be at least `kl + ku + 1`."
ald_transa_m_k = "When `(transpose_a == Transpose::kNo && layout == Layout::kColMajor) || (transpose_a == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`."
ald_trans_n_k = "When `(transpose == Transpose::kNo && layout == Layout::kColMajor) || (transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`."
ald_side_m_n = "When `side = Side::kLeft` then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`."
bld_m = "The value of `b_ld` must be at least `m`."
bld_n = "The value of `b_ld` must be at least `n`."
bld_transb_k_n = "When `(transpose_b == Transpose::kNo && layout == Layout::kColMajor) || (transpose_b == Transpose::kYes && layout == Layout::kRowMajor)`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`."
bld_trans_n_k = "When `(transpose == Transpose::kNo && layout == Layout::kColMajor) || (transpose == Transpose::kYes && layout == Layout::kRowMajor)`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`."
cld_m = "The value of `c_ld` must be at least `m`."
cld_n = "The value of `c_ld` must be at least `n`."


# Helper functions to compute vector and matrix sizes
def size_helper(condition, size_one, size_two, multiplier):
    length = "(" + condition + ")" + " ? " + size_one + " * " + multiplier + " : " + size_two + " * " + multiplier
    return length
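
# Editor's illustration (hypothetical helper, not used by the generator): shows the C ternary
# expression string produced by size_helper() above, as it ends up in the generated sources.
def _demo_size_helper():
    example = size_helper("side == CLBlastSideLeft", "m", "n", "x_inc")
    assert example == "(side == CLBlastSideLeft) ? m * x_inc : n * x_inc"
    return example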
" + size_one + " * " + multiplier + " : " + size_two + " * " + multiplier return length def layout_transpose_condition(prefix): return "(layout == CLBlastLayoutColMajor && " + prefix + "_transpose != CLBlastTransposeNo) || " +\ "(layout == CLBlastLayoutRowMajor && " + prefix + "_transpose == CLBlastTransposeNo)" # Different possibilities for the vector and matrix sizes xn = "n * x_inc" xm = "m * x_inc" yn = "n * y_inc" ym = "m * y_inc" zn = "n * z_inc" an = "n * a_ld" apn = "((n*(n+1)) / 2)" cn = "n * c_ld" xmn = size_helper("a_transpose != CLBlastTransposeNo", "m", "n", "x_inc") ynm = size_helper("a_transpose != CLBlastTransposeNo", "n", "m", "y_inc") amn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "a_ld") amns = size_helper("side == CLBlastSideLeft", "m", "n", "a_ld") amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld") ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld") ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld") bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld") bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld") bmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "b_ld") bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld") cmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "c_ld") ammn = size_helper("layout == CLBlastLayoutRowMajor", "m", "((side == CLBlastSideLeft) ? m : n)", "a_ld") bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft) ? m : n)", "n", "b_ld") im = "height * width * channels" col = "height * width * channels" imb = "height * width * channels * batch_count" kernel = "kernel_h * kernel_w * num_kernels" result = "height_out * width_out * num_kernels * batch_count" # ================================================================================================== # Populates a list of routines im2col_constants = ["channels", "height", "width", "kernel_h", "kernel_w", "pad_h", "pad_w", "stride_h", "stride_w", "dilation_h", "dilation_w"] convgemm_constants = im2col_constants + ["num_kernels", "batch_count"] ROUTINES = [ [ # Level 1: vector-vector Routine(False, True, 0, False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), Routine(False, True, 0, False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), Routine(False, True, 0, False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []), Routine(False, True, 0, False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []), Routine(True, True, 0, False, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), Routine(True, True, 0, False, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), Routine(True, True, 0, False, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), Routine(True, True, 0, False, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = 
 Routine(True, True, 0, False, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
 Routine(True, True, 0, False, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
 Routine(True, True, 0, False, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
 Routine(True, True, 0, False, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidean norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
 Routine(True, True, 0, False, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
 Routine(True, False, 0, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
 Routine(True, True, 0, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
 Routine(True, False, 0, False, "1", "amin", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []),
 Routine(True, False, 0, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
 Routine(True, False, 0, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[  # Level 2: matrix-vector
 Routine(True, True, 0, False, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
 Routine(True, True, 0, False, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
 Routine(True, True, 0, False, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
 Routine(True, True, 0, False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
 Routine(True, True, 0, False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
 Routine(True, True, 0, False, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
 Routine(True, True, 0, False, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
 Routine(True, True, 0, False, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
 Routine(True, True, 0, False, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
 Routine(True, True, 0, False, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
 Routine(True, True, 0, False, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.", []),
 Routine(True, True, 0, False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
 Routine(False, True, 0, False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
 Routine(False, True, 0, False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
[], "", "Solves a triangular system of equations", "", []), Routine(False, True, 0, False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), Routine(False, True, 0, False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update Routine(True, True, 0, False, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), Routine(True, True, 0, False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), Routine(True, True, 0, False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), Routine(True, True, 0, False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), Routine(True, True, 0, False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), Routine(True, True, 0, False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), Routine(True, True, 0, False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), Routine(True, True, 0, False, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), Routine(True, True, 0, False, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), Routine(True, True, 0, False, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", 
 Routine(True, True, 0, False, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSYR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[  # Level 3: matrix-matrix
 Routine(True, True, 0, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
 Routine(True, True, 0, False, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmetric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
 Routine(True, True, 0, False, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
 Routine(True, True, 0, False, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
 Routine(True, True, 0, False, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a Hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
 Routine(True, True, 0, False, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
 Routine(True, True, 0, False, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn], ["alpha","beta"], "", "Rank-2K update of a Hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
 Routine(True, True, 0, False, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), Routine(True, True, 0, False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []), ], [ # Level X: extra routines (not part of BLAS) # Special routines: Routine(True, True, 0, False, "x", "had", T, [S,D,C,Z,H], ["n"], [], ["x","y"], ["z"], [xn,yn,zn], ["alpha","beta"], "", "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []), Routine(True, True, 0, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, ["kernel_mode"], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix. Overwrites any existing values in the _col_ buffer", []), Routine(True, True, 0, False, "x", "col2im", T, [S,D,C,Z,H], im2col_constants, ["kernel_mode"], ["col"], ["im"], [col,im], [""], "", "Col2im function (non-BLAS function)", "Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix. 
 Routine(True, True, 0, False, "x", "convgemm", T, [S,D,H], convgemm_constants, ["kernel_mode"], ["im","kernel"], ["result"], [imb,kernel,result], [""], "", "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).", []),
 # Batched routines:
 Routine(True, True, 1, False, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []),
 Routine(True, True, 1, False, "x", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "Batched version of GEMM", "As GEMM, but multiple operations are batched together for better performance.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
 Routine(True, True, 2, False, "x", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "StridedBatched version of GEMM", "As GEMM, but multiple strided operations are batched together for better performance.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
]]


def main(argv):

    # Parses the command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("clblast_root", help="Root of the CLBlast sources")
    parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
    cl_args = parser.parse_args(argv)
    library_root = cl_args.clblast_root

    # Checks whether the command-line arguments are valid; exits otherwise
    for f in FILES:
        if not os.path.isfile(library_root + f):
            print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library")
            sys.exit()

    # Iterates over all regular files to output
    for i in range(0, len(FILES)):

        # Stores the header and the footer of the original file
        with open(library_root + FILES[i]) as f:
            original = f.readlines()
            file_header = original[:HEADER_LINES[i]]
            file_footer = original[-FOOTER_LINES[i]:]

        # Re-writes the body of the file
        with open(library_root + FILES[i], "w", newline="\n") as f:
            body = ""
            levels = [1, 2, 3] if (i == 4 or i == 5 or i == 6) else [1, 2, 3, 4]
            for level in levels:
                if i not in [11]:
                    body += cpp.LEVEL_SEPARATORS[level - 1] + "\n"
                for routine in ROUTINES[level - 1]:
                    if i == 0:
                        body += cpp.clblast_h(routine)
                    if i == 1:
                        body += cpp.clblast_cc(routine)
                    if i == 2:
                        body += cpp.clblast_c_h(routine)
                    if i == 3:
                        body += cpp.clblast_c_cc(routine)
                    if i == 4:
                        body += cpp.wrapper_clblas(routine)
                    if i == 5:
                        body += cpp.wrapper_cblas(routine)
                    if i == 6:
                        body += cpp.wrapper_cublas(routine)
                    if i == 7:
                        if routine.batched == 0 and routine.name not in ["convgemm"]:
                            body += cpp.clblast_netlib_c_h(routine)
                    if i == 8:
                        if routine.batched == 0 and routine.name not in ["convgemm"]:
                            body += cpp.clblast_netlib_c_cc(routine)
                    if i == 9:
                        body += cpp.clblast_h(routine, cuda=True)
                    if i == 10:
                        body += cpp.clblast_cc(routine, cuda=True)
                    if i == 11:
                        body += pyclblast.generate_pyx(routine)
            f.write("".join(file_header))
            f.write(body)
            f.write("".join(file_footer))

    # Outputs all the test implementations
    for level in [1, 2, 3, 4]:
        for routine in ROUTINES[level - 1]:
            if routine.has_tests:
                level_string = cpp.LEVEL_NAMES[level - 1]
                routine_suffix = "level" + level_string + "/x" + routine.lowercase_name() + ".cpp"

                # Correctness tests
                filename = library_root + "/test/correctness/routines/" + routine_suffix
                with open(filename, "w", newline="\n") as f:
                    f.write(cpp.HEADER + "\n")
                    f.write(cpp.correctness_test(routine, level_string))
                    f.write(cpp.FOOTER)

                # Performance tests
                filename = library_root + "/test/performance/routines/" + routine_suffix
                with open(filename, "w", newline="\n") as f:
                    f.write(cpp.HEADER + "\n")
                    f.write(cpp.performance_test(routine, level_string))
                    f.write(cpp.FOOTER)

    # API documentation
    filename = cl_args.clblast_root + "/doc/api.md"

    # Stores the header and the footer of the original documentation file
    with open(filename) as f:
        original = f.readlines()
        file_header = original[:HEADER_LINES_DOC]
        file_footer = original[-FOOTER_LINES_DOC:]

    # Outputs the API documentation
    with open(filename, "w", newline="\n") as f:

        # Outputs the header
        f.write("".join(file_header))
        doc_header = doc.header()
        f.write(doc_header)

        # Generates the documentation for each routine
        for level in [1, 2, 3, 4]:
            for routine in ROUTINES[level - 1]:
                if routine.implemented:
                    doc_routine = doc.generate(routine)
                    f.write(doc_routine)

        # Outputs the footer
        f.write("".join(file_footer))


if __name__ == '__main__':
    main(sys.argv[1:])
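
# Editor's illustration (hypothetical sketch, not part of the original sources): the generator
# above keeps the first HEADER_LINES[i] and last FOOTER_LINES[i] lines of each target file intact
# and regenerates only the body in between, along these lines:
def _demo_splice(path, header_lines, footer_lines, new_body):
    with open(path) as f:
        original = f.readlines()
    with open(path, "w", newline="\n") as f:
        f.write("".join(original[:header_lines]))   # hand-written preamble, kept as-is
        f.write(new_body)                           # freshly generated routine bodies
        f.write("".join(original[-footer_lines:]))  # hand-written epilogue, kept as-is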

CLBlast-1.6.3/scripts/generator/generator/
CLBlast-1.6.3/scripts/generator/generator/__init__.py
CLBlast-1.6.3/scripts/generator/generator/convert.py

# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
#   Cedric Nugteren


def precision_to_full_name(x):
    """Translates an option name to a CLBlast data-type"""
    return {
        'H': "Half",
        'S': "Single",
        'D': "Double",
        'C': "ComplexSingle",
        'Z': "ComplexDouble",
    }[x]


def option_to_clblast(x):
    """Translates an option name to a CLBlast data-type"""
    return {
        'layout': "Layout",
        'a_transpose': "Transpose",
        'b_transpose': "Transpose",
        'ab_transpose': "Transpose",
        'side': "Side",
        'triangle': "Triangle",
        'diagonal': "Diagonal",
        'kernel_mode': "KernelMode",
    }[x]


def option_to_clblas(x):
    """As above, but for clBLAS data-types"""
    return {
        'layout': "clblasOrder",
        'a_transpose': "clblasTranspose",
        'b_transpose': "clblasTranspose",
        'ab_transpose': "clblasTranspose",
        'side': "clblasSide",
        'triangle': "clblasUplo",
        'diagonal': "clblasDiag",
    }[x]


def option_to_cblas(x):
    """As above, but for CBLAS data-types"""
    return {
        'layout': "CBLAS_ORDER",
        'a_transpose': "CBLAS_TRANSPOSE",
        'b_transpose': "CBLAS_TRANSPOSE",
        'ab_transpose': "CBLAS_TRANSPOSE",
        'side': "CBLAS_SIDE",
        'triangle': "CBLAS_UPLO",
        'diagonal': "CBLAS_DIAG",
    }[x]


def option_to_cublas(x):
    """As above, but for cuBLAS data-types"""
    return {
        'layout': "Layout",
        'a_transpose': "cublasOperation_t",
        'b_transpose': "cublasOperation_t",
        'ab_transpose': "cublasOperation_t",
        'side': "cublasSideMode_t",
        'triangle': "cublasFillMode_t",
        'diagonal': "cublasDiagType_t",
    }[x]


def option_to_documentation(x):
    """Translates an option name to a documentation string"""
    return {
        'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.",
        'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
        'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
        'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
        'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).",
        'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).",
        'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.",
        'kernel_mode': "The kernel mode, either `KernelMode::kCrossCorrelation` for the normal mode, or `KernelMode::kConvolution` for the convolution mode that flips a kernel along `h` and `w` axes.",
    }[x]
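
# Editor's illustration (hypothetical demo, not part of the original module): these lookup tables
# turn one abstract option name into the typename used by each backend API.
def _demo_option_typenames():
    assert option_to_clblast("a_transpose") == "Transpose"
    assert option_to_clblas("a_transpose") == "clblasTranspose"
    assert option_to_cblas("a_transpose") == "CBLAS_TRANSPOSE"
    assert option_to_cublas("a_transpose") == "cublasOperation_t"
    return precision_to_full_name("Z")  # -> "ComplexDouble"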

CLBlast-1.6.3/scripts/generator/generator/cpp.py

# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
#   Cedric Nugteren

import generator.datatype as datatype
import generator.convert as convert

NL = "\n"
SEPARATOR = "// ================================================================================================="

# Separators for the BLAS levels
LEVEL_SEPARATORS = [
    NL + SEPARATOR + NL + "// BLAS level-1 (vector-vector) routines" + NL + SEPARATOR,
    NL + SEPARATOR + NL + "// BLAS level-2 (matrix-vector) routines" + NL + SEPARATOR,
    NL + SEPARATOR + NL + "// BLAS level-3 (matrix-matrix) routines" + NL + SEPARATOR,
    NL + SEPARATOR + NL + "// Extra non-BLAS routines (level-X)" + NL + SEPARATOR
]

# Names of the level sub-folders
LEVEL_NAMES = ["1", "2", "3", "x"]

# Main header/footer for source files
FOOTER = NL + SEPARATOR + NL
HEADER = NL + SEPARATOR + """
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
""" + SEPARATOR + NL


def clblast_h(routine, cuda=False):
    """The C++ API header (.h)"""
    result = NL + "// " + routine.description + ": " + routine.short_names() + NL
    result += routine.routine_header_cpp(12, " = nullptr", cuda) + ";" + NL
    return result

def clblast_cc(routine, cuda=False):
    """The C++ API implementation (.cpp)"""
    indent1 = " " * (15 + routine.length())
    result = NL + "// " + routine.description + ": " + routine.short_names() + NL
    if routine.implemented:
        result += routine.routine_header_cpp(12, "", cuda, implementation=True) + " {" + NL
        result += " try {" + NL
        if cuda:
            result += " const auto context_cpp = Context(context);" + NL
            result += " const auto device_cpp = Device(device);" + NL
            result += " auto queue_cpp = Queue(context_cpp, device_cpp);" + NL
        else:
            result += " auto queue_cpp = Queue(*queue);" + NL
        event = "nullptr" if cuda else "event"
        result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, " + event + ");" + NL
        if routine.batched == 1:
            result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL
        if routine.temp_buffer:
            null = "0" if cuda else "nullptr"
            result += " const auto temp_buffer_provided = temp_buffer != " + null + ";\n"
            result += " auto temp_buffer_cpp = temp_buffer_provided ? Buffer<T>(temp_buffer) : Buffer<T>(" + null + ");\n"
        result += " routine.Do" + routine.capitalized_name() + "("
        result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
        if routine.temp_buffer:
            result += ",\n" + indent1 + "temp_buffer_cpp, temp_buffer_provided"
        result += ");" + NL
        result += " return StatusCode::kSuccess;" + NL
        result += " } catch (...) { return DispatchException(); }" + NL
    else:
        result += routine.routine_header_type_cpp(12, cuda) + " {" + NL
        result += " return StatusCode::kNotImplemented;" + NL
    result += "}" + NL
    for flavour in routine.flavours:
        indent2 = " " * (34 + routine.length() + len(flavour.template))
        result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">("
        arguments = routine.arguments_type(flavour)
        if cuda:
            arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments]
        result += ("," + NL + indent2).join([a for a in arguments])
        result += "," + NL + indent2
        if cuda:
            result += "const CUcontext, const CUdevice"
            if routine.temp_buffer:
                result += ", CUdeviceptr"
        else:
            result += "cl_command_queue*, cl_event*"
            if routine.temp_buffer:
                result += ", cl_mem"
        result += ");" + NL
    return result


def clblast_c_h(routine):
    """The C API header (.h)"""
    result = NL + "// " + routine.description + ": " + routine.short_names() + NL
    for flavour in routine.flavours:
        result += routine.routine_header_c(flavour, 38, " PUBLIC_API") + ";" + NL
    return result


def clblast_c_cc(routine):
    """The C API implementation (.cpp)"""
    result = NL + "// " + routine.name.upper() + NL
    for flavour in routine.flavours:
        template = "<" + flavour.template + ">" if routine.no_scalars() else ""
        indent = " " * (16 + routine.length() + len(template))
        result += routine.routine_header_c(flavour, 27, "") + " {" + NL
        if routine.batched == 1:
            result += " " + (NL + " ").join(routine.batched_transform_to_complex(flavour)) + NL
        result += " try {" + NL
        result += " return static_cast<CLBlastStatusCode>(" + NL
        result += " clblast::" + routine.capitalized_name() + template + "("
        result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
        result += "," + NL + indent + "queue, event)" + NL
        result += " );" + NL
        result += " } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }" + NL
        result += "}" + NL
    return result
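
# Editor's illustration (abridged, hypothetical output): for single-precision AXPY the two
# functions above emit roughly the following C declaration in clblast_c.h, with the matching
# definition in clblast_c.cpp casting into the C++ API and catching any exceptions:
#
#   CLBlastStatusCode PUBLIC_API CLBlastSaxpy(const size_t n, const float alpha,
#                                             const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
#                                             cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
#                                             cl_command_queue* queue, cl_event* event);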

def clblast_netlib_c_h(routine):
    """The Netlib CBLAS API header (.h)"""
    result = NL + "// " + routine.description + ": " + routine.short_names() + NL
    for flavour in routine.flavours:
        if flavour.precision_name in ["S", "D", "C", "Z"]:
            result += routine.routine_header_netlib(flavour, 20, " PUBLIC_API") + ";" + NL
    return result


def clblast_netlib_c_cc(routine):
    """The Netlib CBLAS API implementation (.cpp)"""
    result = NL + "// " + routine.name.upper() + NL
    for flavour in routine.flavours:

        # There is a version available in CBLAS
        if flavour.precision_name in ["S", "D", "C", "Z"]:
            template = "<" + flavour.template + ">" if routine.no_scalars() else ""
            name_postfix = "_sub" if routine.name in routine.routines_scalar_no_return() else ""
            indent = " " * (21 + routine.length() + len(template))
            result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL

            # Initialize OpenCL
            result += " OPTIONAL_STATIC auto device = get_device();" + NL
            result += " OPTIONAL_STATIC auto context = clblast::Context(device);" + NL
            result += " auto queue = clblast::Queue(context, device);" + NL

            # Set alpha and beta
            result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour))

            # Copy data structures to the device
            for i, name in enumerate(routine.inputs + routine.outputs):
                result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL
            for i, name in enumerate(routine.inputs + routine.outputs):
                buffer_type = routine.get_buffer_type(name, flavour)
                result += " " + routine.create_buffer(name, buffer_type) + NL
                if name in routine.scalar_buffers_second_non_pointer():
                    result += " " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL
            for name in routine.inputs + routine.outputs:
                if name not in routine.scalar_buffers_first():
                    prefix = "" if name in routine.outputs else "const "
                    buffer_type = routine.get_buffer_type(name, flavour)
                    result += " " + routine.write_buffer(name, prefix + buffer_type) + NL

            # The function call
            result += " auto queue_cl = queue();" + NL
            result += " auto s = clblast::" + routine.name.capitalize() + template + "("
            result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)])
            result += "," + NL + indent + "&queue_cl);" + NL

            # Error handling
            result += " if (s != clblast::StatusCode::kSuccess) {" + NL
            result += " throw std::runtime_error(\"CLBlast returned with error code \" + clblast::ToString(s));" + NL
            result += " }" + NL

            # Copy back and clean-up
            for name in routine.outputs:
                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
                    buffer_type = routine.get_buffer_type(name, flavour)
                    result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL
            for name in routine.outputs:
                buffer_type = routine.get_buffer_type(name, flavour)
                result += " " + routine.read_buffer(name, buffer_type) + NL
            for name in routine.outputs:
                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
                    result += " return " + name + "[0]"
                    if flavour.buffer_type in ["float2", "double2"]:
                        if name not in routine.index_buffers():
                            result += ".real()"
                    result += ";" + NL
            result += "}" + NL
    return result
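
# Editor's note (illustration, not part of the original sources): the Netlib wrappers generated
# above hide all OpenCL plumbing behind a plain CBLAS signature: they create a device, context
# and queue, copy the host arrays into device buffers, invoke the CLBlast routine, and copy the
# results back, so a call such as cblas_saxpy(n, alpha, x, 1, y, 1) works on host pointers.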

def wrapper_clblas(routine):
    """The wrapper to the reference clBLAS routines (for performance/correctness testing)"""
    result = ""
    if routine.has_tests:
        result += NL + "// Forwards the clBLAS calls for %s" % routine.short_names_tested() + NL
        if routine.no_scalars():
            result += routine.routine_header_wrapper_clblas(routine.template, True, 21) + ";" + NL
        for flavour in routine.flavours:
            result += routine.routine_header_wrapper_clblas(flavour, False, 21) + " {" + NL

            # There is a version available in clBLAS
            if flavour.precision_name in ["S", "D", "C", "Z"]:
                indent = " " * (17 + routine.length())
                arguments = routine.arguments_wrapper_clblas(flavour)
                if routine.scratch:
                    result += " auto queue = Queue(queues[0]);" + NL
                    result += " auto context = queue.GetContext();" + NL
                    result += " auto scratch_buffer = Buffer<" + flavour.template + ">"
                    result += "(context, " + routine.scratch + ");" + NL
                    arguments += ["scratch_buffer()"]
                result += " return clblas" + flavour.name + routine.name + "("
                result += ("," + NL + indent).join([a for a in arguments])
                result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);"

            # There is no clBLAS available, forward the call to one of the available functions
            else:  # Half-precision
                indent = " " * (24 + routine.length())

                # Convert to float (note: also integer buffers are stored as half/float)
                for buf in routine.inputs + routine.outputs:
                    if buf not in routine.index_buffers():
                        result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL
                    else:
                        result += " auto " + buf + "_buffer_bis = " + buf + "_buffer;" + NL

                # Call the float routine
                result += " auto status = clblasX" + routine.name + "("
                result += ("," + NL + indent).join([a for a in routine.arguments_half()])
                result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);"
                result += NL

                # Convert back to half
                for buf in routine.outputs:
                    if buf not in routine.index_buffers():
                        result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL
                result += " return status;"

            # Complete
            result += NL + "}" + NL
    return result
available functions else: # Half-precision indent = " " * (9 + routine.length()) # Convert to float (note: also integer buffers are stored as half/float) for buf in routine.inputs + routine.outputs: if buf not in routine.index_buffers(): result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL else: result += " auto " + buf + "_buffer_bis = " + buf + "_buffer;" + NL # Call the float routine result += " cblasX" + routine.name + "(" result += ("," + NL + indent).join([a for a in routine.arguments_half()]) result += ");" + NL # Convert back to half for buf in routine.outputs: if buf not in routine.index_buffers(): result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL # Complete result += "}" + NL return result def wrapper_cublas(routine): """The wrapper to the reference cuBLAS routines (for performance/correctness testing)""" result = "" if routine.has_tests: result += NL + "// Forwards the cuBLAS calls for %s" % routine.short_names_tested() + NL if routine.no_scalars(): result += routine.routine_header_wrapper_cublas(routine.template, True, 23) + ";" + NL for flavour in routine.flavours: result += routine.routine_header_wrapper_cublas(flavour, False, 23) + " {" + NL # There is a version available in cuBLAS if flavour.precision_name in ["S", "D", "C", "Z"]: indent = " " * (24 + routine.length()) arguments = routine.arguments_wrapper_cublas(flavour) # Handles row-major if routine.has_layout(): result += " if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }" + NL # Complex scalars for scalar in routine.scalars: if flavour.is_complex(scalar): cuda_complex = "cuDoubleComplex" if flavour.precision_name == "Z" else "cuComplex" result += " " + cuda_complex + " " + scalar + "_cuda;" + NL result += " " + scalar + "_cuda.x = " + scalar + ".real();" + NL result += " " + scalar + "_cuda.y = " + scalar + ".imag();" + NL # Calls the cuBLAS routine result += " auto status = cublas" + flavour.name_cublas() + routine.name + "(handle, " result += ("," + NL + indent).join([a for a in arguments]) + ");" + NL result += " cudaDeviceSynchronize();" + NL result += " return status;" # There is no cuBLAS available, forward the call to one of the available functions else: # Half-precision result += " return CUBLAS_STATUS_NOT_SUPPORTED;" # indent = " " * (24 + routine.length()) # # Convert to float (note: also integer buffers are stored as half/float) # for buf in routine.inputs + routine.outputs: # result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL # # Call the float routine # result += " return cublasX" + routine.name + "(handle," # result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + ");" + NL # result += " cudaDeviceSynchronize();" + NL # result += " return status;" # # Convert back to half # for buf in routine.outputs: # result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL # result += " return status;" # Complete result += NL + "}" + NL return result def performance_test(routine, level_string): """Generates the body of a performance test for a specific routine""" result = "" result += "#include \"test/performance/client.hpp\"" + NL result += "#include \"test/routines/level" + level_string + "/x" + routine.lowercase_name() + ".hpp\"" + NL + NL result += "// Main function (not within the clblast namespace)" + NL result += "int main(int argc, char *argv[]) {" + NL result += " const auto command_line_args = 
clblast::RetrieveCommandLineArguments(argc, argv);" + NL default = convert.precision_to_full_name(routine.flavours[0].precision_name) result += " switch(clblast::GetPrecision(command_line_args, clblast::Precision::k" + default + ")) {" + NL for precision in ["H", "S", "D", "C", "Z"]: result += " case clblast::Precision::k" + convert.precision_to_full_name(precision) + ":" found = False for flavour in routine.flavours: if flavour.precision_name == precision: extra_template_argument = "0, " if routine.name == "gemm" and routine.batched == 0 else "" result += NL + " clblast::RunClient # Short-hands for data-types D_HALF = "half" D_FLOAT = "float" D_DOUBLE = "double" D_FLOAT2 = "float2" D_DOUBLE2 = "double2" D_HALF_OPENCL = "cl_half" D_FLOAT2_OPENCL = "cl_float2" D_DOUBLE2_OPENCL = "cl_double2" class DataType: """Class holding data-type and precision information""" def __init__(self, precision_name, name, template, scalars, buffer_type): self.precision_name = precision_name self.name = name self.template = template self.alpha_cpp = scalars[0] self.beta_cpp = scalars[1] self.alpha_cl = scalars[2] self.beta_cl = scalars[3] self.buffer_type = buffer_type def use_alpha(self, postfix=""): """Outputs the name of the data-type (alpha/beta), possibly transforming into the right type""" if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: return self.alpha_cpp + "{alpha" + postfix + ".s[0], alpha" + postfix + ".s[1]}" return "alpha" + postfix def use_beta(self, postfix=""): """As above, but for beta instead of alpha""" if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: return self.beta_cpp + "{beta" + postfix + ".s[0], beta" + postfix + ".s[1]}" return "beta" + postfix def use_alpha_opencl(self): """As above, but the transformation is in the opposite direction""" if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: return self.alpha_cl + "{{alpha.real(), alpha.imag()}}" return "alpha" def use_beta_opencl(self): """As above, but for beta instead of alpha""" if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: return self.beta_cl + "{{beta.real(), beta.imag()}}" return "beta" def use_alpha_clblast(self): """Transforms a Netlib CBLAS parameter to CLBlast style""" if self.alpha_cpp == D_FLOAT2: return self.alpha_cpp + "{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}" elif self.alpha_cpp == D_DOUBLE2: return self.alpha_cpp + "{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}" return "alpha" def use_beta_clblast(self): """As above, but for beta instead of alpha""" if self.beta_cpp == D_FLOAT2: return self.beta_cpp + "{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}" elif self.beta_cpp == D_DOUBLE2: return self.beta_cpp + "{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}" return "beta" def test_template(self, extra_template_argument): """Returns the template as used in the correctness/performance tests""" buffer_type = "clblast::" + self.buffer_type if self.is_non_standard() else self.buffer_type beta_cpp = "clblast::" + self.beta_cpp if self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2] else self.beta_cpp if self.buffer_type != self.beta_cpp: return "<" + extra_template_argument + buffer_type + "," + self.beta_cpp + ">, " + buffer_type + ", " + beta_cpp return "<" + extra_template_argument + buffer_type + ">, " + buffer_type + ", " + beta_cpp def is_complex(self, scalar): """Current scalar is complex""" return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2])) def is_non_standard(self): """Current type is of a non-standard type""" return 
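# NOTE: illustrative sketch only, not part of the original file. The conversion
# helpers above render scalar arguments for the generated APIs. For the complex
# single-precision flavour "C" and the plain flavour "S" defined below, they yield:
#
#     C.use_alpha()         # -> 'float2{alpha.s[0], alpha.s[1]}'
#     C.use_alpha_opencl()  # -> 'cl_float2{{alpha.real(), alpha.imag()}}'
#     S.use_alpha()         # -> 'alpha'  (plain types pass through unchanged)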
self.buffer_type in [D_HALF, D_FLOAT2, D_DOUBLE2]

    def name_cublas(self):
        if "i" in self.name:
            return "I" + self.name[1].lower()
        return self.name


# Regular data-types
H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF)  # half (16)
S = DataType("S", "S", D_FLOAT, [D_FLOAT] * 4, D_FLOAT)  # single (32)
D = DataType("D", "D", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE)  # double (64)
C = DataType("C", "C", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2)  # single-complex (3232)
Z = DataType("Z", "Z", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2)  # double-complex (6464)

# Special cases
Sc = DataType("C", "Sc", D_FLOAT2, [D_FLOAT2] * 4, D_FLOAT2)  # As C, but with real output
Dz = DataType("Z", "Dz", D_DOUBLE2, [D_DOUBLE2] * 4, D_DOUBLE2)  # As Z, but with real output
iH = DataType("H", "iH", D_HALF, [D_HALF] * 4, D_HALF)  # As H, but with integer output
iS = DataType("S", "iS", D_FLOAT, [D_FLOAT] * 4, D_FLOAT)  # As S, but with integer output
iD = DataType("D", "iD", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE)  # As D, but with integer output
iC = DataType("C", "iC", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2)  # As C, but with integer output
iZ = DataType("Z", "iZ", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2)  # As Z, but with int output
Css = DataType("C", "C", D_FLOAT, [D_FLOAT, D_FLOAT, D_FLOAT, D_FLOAT], D_FLOAT2)  # As C, but with constants from S
Zdd = DataType("Z", "Z", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE2)  # As Z, but with constants from D
Ccs = DataType("C", "C", D_FLOAT2 + "," + D_FLOAT, [D_FLOAT2, D_FLOAT, D_FLOAT2_OPENCL, D_FLOAT], D_FLOAT2)  # As C, but with one constant from S
Zzd = DataType("Z", "Z", D_DOUBLE2 + "," + D_DOUBLE, [D_DOUBLE2, D_DOUBLE, D_DOUBLE2_OPENCL, D_DOUBLE], D_DOUBLE2)  # As Z, but with one constant from D

# C++ template data-types
T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T")  # regular routine
Tc = DataType("Tc", "typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>")  # for herk
TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")  # for her2k

CLBlast-1.6.3/scripts/generator/generator/doc.py

# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
#   Cedric Nugteren <www.cedricnugteren.nl>

NL = "\n"


def header():
    """Generates the header for the API documentation"""
    result = "CLBlast: API reference" + NL
    result += "================" + NL + NL + NL
    return result


def generate(routine):
    """Generates the API documentation for a given routine"""
    result = ""

    # Routine header
    result += "x" + routine.upper_name() + ": " + routine.description + NL
    result += "-------------" + NL + NL
    result += routine.details + NL + NL

    # Routine API
    result += "C++ API:" + NL
    result += "```" + NL
    result += routine.routine_header_cpp(12, "") + NL
    result += "```" + NL + NL
    result += "C API:" + NL
    result += "```" + NL
    for flavour in routine.flavours:
        result += routine.routine_header_c(flavour, 27, "") + NL
    result += "```" + NL + NL

    # Routine arguments
    result += "Arguments to " + routine.upper_name() + ":" + NL + NL
    for argument in routine.arguments_doc():
        result += "* " + argument + NL
    result += "* `cl_command_queue* queue`: "
    result += "Pointer to an OpenCL command queue associated with a context and device to execute the routine on."
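# NOTE: illustrative sketch only, not part of the original file. For a level-1
# routine such as xAXPY, generate() above produces markdown of roughly this shape
# (abbreviated; the argument bullets come from arguments_doc()):
#
#     xAXPY: Vector-times-constant plus vector
#     -------------
#
#     C++ API:
#     ```
#     template <typename T>
#     StatusCode Axpy(const size_t n, const T alpha, ...)
#     ```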
+ NL result += "* `cl_event* event`: " result += "Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). " result += "This is an optional argument." + NL + NL # Routine requirements if len(routine.requirements_doc()) > 0: result += "Requirements for " + routine.upper_name() + ":" + NL + NL for requirement in routine.requirements_doc(): result += "* " + requirement + NL result += NL # Routine footer result += NL + NL return result CLBlast-1.6.3/scripts/generator/generator/pyclblast.py000066400000000000000000000224671463263031500230350ustar00rootroot00000000000000 # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren import os NL = '\n' SEPARATOR = "####################################################################################################" def to_np_dtype(flavour): return { "S": "float32", "D": "float64", "C": "complex64", "Z": "complex128", "H": "float16", }[flavour.precision_name] def cl_type(flavour): return { "S": "cl_float", "D": "cl_double", "C": "cl_float2", "Z": "cl_double2", "H": "cl_half", }[flavour.precision_name] def scalar_cython_conversion(scalar, flavour): scalar_type = flavour.alpha_cl if scalar == "alpha" else flavour.beta_cl if scalar_type == "float": return "" + scalar if scalar_type == "double": return "" + scalar if scalar_type in ["cl_float2", "float2"]: return "cl_float2(x=" + scalar + ".real,y=" + scalar + ".imag)" if scalar_type in ["cl_double2", "double2"]: return "cl_double2(x=" + scalar + ".real,y=" + scalar + ".imag)" if scalar_type in ["cl_half", "half"]: return "val_to_half(" + scalar + ")" raise RuntimeError("Could not convert flavour '%s:%s'" % (flavour.precision_name, scalar_type)) def generate_pyx(routine): result = "" if routine.implemented and routine.plain_name() and routine.level in ["1", "2a", "2b", "3", "x"]: if routine.level == "x" and routine.batched == 0: return result # level-X routines that are non-batched are not supported at the moment indent = " " result += SEPARATOR + NL result += "# " + routine.description + ": " + routine.short_names() + NL result += SEPARATOR + NL result += NL # Reference C definition result += "cdef extern from \"clblast_c.h\":" + NL np_dtypes = [] for flavour in routine.flavours: if flavour.precision_name in ["S", "D", "C", "Z", "H"]: result += indent + "CLBlastStatusCode CLBlast" + flavour.name + routine.plain_name() + "(" result += ", ".join(routine.arguments_def_c(flavour)) + "," result += "cl_command_queue* queue, cl_event* event)" + NL np_dtypes.append(to_np_dtype(flavour)) result += "" + NL # Function definition buffers = routine.inputs[:] + routine.outputs[:] result += "def " + routine.plain_name() + "(queue, " result += ", ".join(routine.arguments_python()) + "):" + NL # Documentation result += indent + "\"\"\"" + NL result += indent + "x" + routine.upper_name() + ": " + routine.description + NL result += indent + "\"\"\"" + NL result += NL # Data types and checks int_buff = [] other_buff = [] for buf in buffers: if buf in routine.index_buffers(): int_buff.append(buf) else: other_buff.append(buf) result += indent + "dtype = check_dtype([" + ", ".join(other_buff) + "], " result += "[" + ", ".join(['"%s"' % d for d in np_dtypes]) + "])" + NL if int_buff: result += indent + "check_dtype([" + ", ".join(int_buff) + "], " result += "[" + ", ".join(['"uint16", "uint32", "uint64"']) + "])" + NL for buf 
in buffers: if buf in routine.buffers_vector(): result += indent + "check_vector(" else: result += indent + "check_matrix(" result += buf + ", \"" + buf + "\")" + NL result += NL # Batched checks if routine.batched == 1: # batched but not strided-batched lists = [b + "_offsets" for b in buffers] + [s + "s" for s in routine.scalars] result += indent + "if " + " != ".join(["len(" + l + ")" for l in lists]) + ":" + NL result += indent + indent + "raise RuntimeError(\"PyCLBlast: 'CLBlastX" + routine.plain_name() + "' failed: length of batch-sized arguments " + ", ".join(lists) + " should be equal\")" + NL result += indent + "batch_count = len(" + lists[0] + ")" + NL result += NL # Batched list to pointer conversions for buf in buffers: result += indent + "cdef size_t *" + buf + "_offsets_c = PyMem_Malloc(batch_count * sizeof(size_t))" + NL result += indent + "for i in range(batch_count):" + NL result += indent + indent + "" + buf + "_offsets_c[i] = " + buf + "_offsets[i]" + NL for scalar in routine.scalars: result += indent + "cdef void *" + scalar + "s_c = PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))" + NL result += indent + "for i in range(batch_count):" + NL if_prefix = "" for flavour in routine.flavours: if flavour.precision_name in ["S", "D", "C", "Z", "H"]: np_dtype = to_np_dtype(flavour) result += indent + indent + if_prefix + "if dtype == np.dtype(\"" + np_dtype + "\"):" + NL scalar_converted = scalar_cython_conversion(scalar + "s[i]", flavour) result += indent + indent + indent + "(<" + cl_type(flavour) + "*>" + scalar + "s_c)[i] = " + scalar_converted + NL if_prefix = "el" result += NL # Buffer transformation for buf in buffers: result += indent + "cdef cl_mem " + buf + "_buffer = " + buf + ".base_data.int_ptr" + NL result += NL result += indent + "cdef cl_command_queue command_queue = queue.int_ptr" + NL result += indent + "cdef cl_event event = NULL" + NL for option in routine.options: if option == "a_transpose": result += indent + "a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo" + NL if option == "b_transpose": result += indent + "b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo" + NL if option == "ab_transpose": result += indent + "ab_transpose = CLBlastTransposeYes if ab_transp else CLBlastTransposeNo" + NL if option == "side": result += indent + "side = CLBlastSideRight if right_side else CLBlastSideLeft" + NL if option == "triangle": result += indent + "triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper" + NL if option == "diagonal": result += indent + "diagonal = CLBlastDiagonalUnit if unit_diagonal else CLBlastDiagonalNonUnit" + NL result += "" + NL result += indent + "cdef CLBlastStatusCode err" + NL if_prefix = "" for flavour in routine.flavours: if flavour.precision_name in ["S", "D", "C", "Z", "H"]: np_dtype = to_np_dtype(flavour) if routine.batched != 1: # regular or strided-batched argument_names = [x. replace("layout", "CLBlastLayoutRowMajor"). replace("alpha", scalar_cython_conversion("alpha", flavour)). replace("beta", scalar_cython_conversion("beta", flavour)) for x in routine.arguments()] else: # batched but not strided-batched argument_names = [x. replace("layout", "CLBlastLayoutRowMajor"). replace("_cpp", "_c"). replace("_offsets", "_offsets_c"). replace("alphas_c", "<" + cl_type(flavour) + "*>alphas_c"). 
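# NOTE: illustrative sketch only, not part of the original file. From user code,
# the binding generated here is called as in the PyCLBlast samples (queue is a
# pyopencl CommandQueue; x and y are pyopencl.array.Array objects on that queue).
# The returned value wraps the OpenCL event via cl.Event.from_int_ptr(), as the
# final lines of generate_pyx() below show:
#
#     import pyclblast
#     event = pyclblast.axpy(queue, n, x, y, alpha=1.5)
#     event.wait()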
replace("betas_c", "<" + cl_type(flavour) + "*>betas_c") for x in routine.arguments()] if routine.batched > 0: argument_names.append("batch_count") result += indent + if_prefix + "if dtype == np.dtype(\"" + np_dtype + "\"):" + NL result += indent + indent + "err = CLBlast" + flavour.name + routine.plain_name() result += "(" + ", ".join(argument_names) + ", &command_queue, &event)" + NL if_prefix = "el" result += indent + "else:" + NL result += indent + indent + "raise ValueError(\"PyCLBlast: Unrecognized data-type '%s'\" % dtype)" + NL result += NL # Cleaning up if routine.batched == 1: # batched but not strided-batched for array in [b + "_offset" for b in buffers] + routine.scalars: result += indent + "PyMem_Free(" + array + "s_c)" + NL result += NL result += indent + "if err != CLBlastSuccess:" + NL result += indent + indent + "raise RuntimeError(\"PyCLBlast: 'CLBlastX" + routine.plain_name() + "' failed: %s\" % get_status_message(err))" + NL result += indent + "return cl.Event.from_int_ptr(event)" + NL result += NL return result CLBlast-1.6.3/scripts/generator/generator/routine.py000066400000000000000000001334471463263031500225260ustar00rootroot00000000000000 # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the # PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren from itertools import chain import generator.convert as convert class Routine: """Class holding routine-specific information (e.g. name, which arguments, which precisions)""" def __init__(self, implemented, has_tests, batched_strided, temp_buffer, level, name, template, flavours, sizes, options, inputs, outputs, buffer_sizes, scalars, scratch, description, details, requirements): self.implemented = implemented self.has_tests = has_tests self.batched = batched_strided self.temp_buffer = temp_buffer self.level = level self.name = name self.template = template self.flavours = flavours self.sizes = sizes self.options = options self.inputs = inputs self.outputs = outputs self.buffer_sizes = buffer_sizes self.scalars = scalars self.scratch = scratch # Scratch buffer (e.g. for xDOT) self.description = description self.details = details self.requirements = requirements def lowercase_name(self): postfix = "strided" if self.batched == 2 else "" postfix += "batched" if self.batched != 0 else "" return self.name + postfix def plain_name(self): postfix = "Strided" if self.batched == 2 else "" postfix += "Batched" if self.batched != 0 else "" return self.name + postfix def capitalized_name(self): postfix = "Strided" if self.batched == 2 else "" postfix += "Batched" if self.batched != 0 else "" return self.name.capitalize() + postfix def upper_name(self): postfix = "STRIDED" if self.batched == 2 else "" postfix += "BATCHED" if self.batched != 0 else "" return self.name.upper() + postfix def b_star(self): return "*" if self.batched == 1 else "" def b_s(self): return "s" if self.batched == 1 else "" def batch_count_def(self): return ["const size_t batch_count"] if self.batched != 0 else [] def batch_count_list(self): return ["batch_count"] if self.batched != 0 else [] def batch_count_type(self): return ["const size_t"] if self.batched != 0 else [] def batch_count_doc(self): return ["`const size_t batch_count`: Number of batches. 
This value must be positive."] if self.batched != 0 else [] def batched_transform_to_cpp(self): result = [] for scalar in self.scalars: result.append("auto " + scalar + "s_cpp = std::vector();") for buffer_name in self.inputs + self.outputs: result.append("auto " + buffer_name + "_offsets_cpp = std::vector();") result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {") for scalar in self.scalars: result.append(" " + scalar + "s_cpp.push_back(" + scalar + "s[batch]);") for buffer_name in self.inputs + self.outputs: result.append(" " + buffer_name + "_offsets_cpp.push_back(" + buffer_name + "_offsets[batch]);") result.append("}") return result def batched_transform_to_complex(self, flavour): result = [] for scalar in self.scalars: result.append("auto " + scalar + "s_cpp = std::vector<" + flavour.buffer_type + ">();") result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {") for scalar in self.scalars: content = scalar if scalar == "alpha": content = flavour.use_alpha(postfix="s[batch]") elif scalar == "beta": content = flavour.use_beta(postfix="s[batch]") result.append(" " + scalar + "s_cpp.push_back(" + content + ");") result.append("}") return result @staticmethod def scalar_buffers_first(): """List of scalar buffers""" return ["dot", "nrm2", "asum", "sum", "imax", "imin"] @staticmethod def scalar_buffers_second(): """List of scalar buffers""" return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"] @staticmethod def scalar_buffers_second_non_pointer(): """As above, but these ones are not passed as pointers but as scalars instead""" return ["sy1"] @staticmethod def other_scalars(): """List of scalars other than alpha and beta""" return ["cos", "sin"] @staticmethod def index_buffers(): """List of buffers with unsigned int type""" return ["imax", "imin"] @staticmethod def postfix(name): """Retrieves the postfix for a buffer""" return "inc" if (name in ["x", "y", "z"]) else "ld" @staticmethod def buffers_vector(): """Distinguish between vectors and matrices""" return ["x", "y", "z"] @staticmethod def buffers_matrix(): """Distinguish between vectors and matrices""" return ["a", "b", "c", "ap"] @staticmethod def buffers_tensor(): """Distinguish between vectors and matrices and tensors""" return ["im", "col", "kernel", "result"] @staticmethod def routines_scalar_no_return(): return ["dotu", "dotc"] @staticmethod def set_size(name, size): """Sets the size of a buffer""" return "const auto " + name + "_size = " + size + ";" @staticmethod def create_buffer(name, template): """Creates a new CLCudaAPI buffer""" return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);" def write_buffer(self, name, template): """Writes to a CLCudaAPI buffer""" postfix = "" if name in self.scalar_buffers_second_non_pointer(): postfix = "_vec" data_structure = "reinterpret_cast<" + template + "*>(" + name + postfix + ")" return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");" @staticmethod def read_buffer(name, template): """Reads from a CLCudaAPI buffer""" data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" return name + "_buffer.Read(queue, " + name + "_size, " + data_structure + ");" def non_index_inputs(self): """Lists of input/output buffers not index (integer)""" buffers = self.inputs[:] # make a copy for i in self.index_buffers(): if i in buffers: buffers.remove(i) return buffers def non_index_outputs(self): """Lists of input/output buffers not index (integer)""" buffers = 
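# NOTE: illustrative sketch only, not part of the original file. For a Routine with
# name "gemm", the batched_strided constructor argument selects the name variants
# produced by the helpers above:
#
#     batched == 0:  plain_name() -> "gemm",               upper_name() -> "GEMM"
#     batched == 1:  plain_name() -> "gemmBatched",        upper_name() -> "GEMMBATCHED"
#     batched == 2:  plain_name() -> "gemmStridedBatched", upper_name() -> "GEMMSTRIDEDBATCHED"
#
# The C API symbol is then "CLBlast" + flavour.name + plain_name(), e.g.
# CLBlastSgemmStridedBatched.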
self.outputs[:] # make a copy for i in self.index_buffers(): if i in buffers: buffers.remove(i) return buffers def buffers_without_ld_inc(self): """List of buffers without 'inc' or 'ld'""" return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap", "im", "col", "kernel", "result"] def get_buffer_type(self, name, flavour): if name in self.index_buffers(): return "int" return flavour.buffer_type def length(self): """Retrieves the number of characters in the routine's name""" return len(self.capitalized_name()) def no_scalars(self): """Determines whether or not this routine has scalar arguments (alpha/beta)""" return self.scalars == [] or self.name in ["im2col", "col2im", "convgemm"] def has_layout(self): """Determines whether the layout is an argument""" return "layout" in self.options def short_names(self): """Returns the upper-case names of these routines (all flavours)""" return "/".join([f.name + self.upper_name() for f in self.flavours]) def short_names_tested(self): """As above, but excludes some""" names = [f.name + self.upper_name() for f in self.flavours] if "H" + self.upper_name() in names: names.remove("H" + self.upper_name()) return "/".join(names) def buffers_first(self): """Determines which buffers go first (between alpha and beta) and which ones go after""" if self.level == "2b" or self.name == "had": return ["x", "y"] extra_buffer = "col" if self.name == "col2im" else "im" return ["ap", "a", "b", "x", extra_buffer, "kernel"] def buffers_second(self): if self.level == "2b" or self.name == "had": return ["z", "ap", "a", "b", "c"] extra_buffer = "im" if self.name == "col2im" else "col" return ["y", "c", extra_buffer, "result"] def buffer(self, name): """Retrieves a variable name for a specific input/output vector/matrix (e.g. 
'x')""" if name in self.inputs or name in self.outputs: a = [name + "_buffer"] b = [name + "_offset" + self.b_s()] c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] if self.batched == 2: c += [name + "_stride"] return [", ".join(a + b + c)] return [] def buffer_bis(self, name): """As above but with a '_bis' suffix for the buffer name""" if name in self.inputs or name in self.outputs: a = [name + "_buffer_bis"] b = [name + "_offset"] c = [name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] if self.batched == 2: c += [name + "_stride"] return [", ".join(a + b + c)] return [] def buffer_zero_offset(self, name): """As above, but with an offset value of zero""" if name in self.inputs or name in self.outputs: a = [name + "_buffer()"] b = ["0"] c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] def buffer_def(self, name): """As above but with data-types""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: a = [prefix + "cl_mem " + name + "_buffer"] b = ["const size_t " + self.b_star() + name + "_offset" + self.b_s()] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] if self.batched == 2: c += ["const size_t " + name + "_stride"] return [", ".join(a + b + c)] return [] def buffer_def_wrapper_cl(self, name, flavour): """As above but for OpenCL""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: if name == "imax": a = [prefix + "Buffer& " + name + "_buffer"] else: a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"] b = ["const size_t " + name + "_offset"] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] return [] def buffer_def_wrapper_cuda(self, name, flavour): """As above but for CUDA""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: if name == "imax": a = [prefix + "unsigned int * " + name + "_buffer"] else: a = [prefix + flavour.buffer_type + "* " + name + "_buffer"] b = ["const size_t " + name + "_offset"] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] return [] def buffer_def_vector(self, name, flavour): """As above but as vectors""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: if name == "imax": a = [prefix + "std::vector& " + name + "_buffer"] else: a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"] b = ["const size_t " + name + "_offset"] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] return [] def buffer_def_pointer(self, name, flavour): """As above but as plain C pointer""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: data_type = "void" if flavour.is_non_standard() else flavour.buffer_type pointer = "" if name in self.scalar_buffers_second_non_pointer() else "*" a = [prefix + data_type + pointer + " " + name + ""] c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + c)] return [] def buffer_clcudaapi(self, name): """As above but with 
CLCudaAPI buffers""" if name in self.inputs or name in self.outputs: buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"] b = [name + "_offsets_cpp"] if self.batched == 1 else [name + "_offset"] c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] if self.batched == 2: c += [name + "_stride"] return [", ".join(a + b + c)] return [] def buffer_wrapper_clblas(self, name): """As above but with a static cast for clBLAS wrapper""" if name in self.inputs or name in self.outputs: a = [name + "_buffer()"] b = [name + "_offset"] c = [] if name in ["x", "y", "z"]: c = ["static_cast(" + name + "_" + self.postfix(name) + ")"] elif name in ["a", "b", "c"]: c = [name + "_" + self.postfix(name)] return [", ".join(a + b + c)] return [] def buffer_wrapper_cblas(self, name, flavour): """As above but with a static cast for CBLAS wrapper""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: if name == "sy1": a = [name + "_buffer[" + name + "_offset]"] elif flavour.precision_name in ["C", "Z"]: a = ["reinterpret_cast<" + prefix + flavour.buffer_type[:-1] + "*>" + "(&" + name + "_buffer[" + name + "_offset])"] else: a = ["&" + name + "_buffer[" + name + "_offset]"] c = [] if name in ["x", "y", "z", "a", "b", "c"]: c = ["static_cast(" + name + "_" + self.postfix(name) + ")"] return [", ".join(a + c)] return [] def buffer_wrapper_cublas(self, name, flavour): """As above but for cuBLAS the wrapper""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: if name in self.index_buffers(): a = ["reinterpret_cast(&" + name + "_buffer[" + name + "_offset])"] elif name in self.outputs and flavour.name in ["Sc", "Dz"]: dtype = "float" if flavour.name == "Sc" else "double" a = ["reinterpret_cast<" + dtype + "*>(&" + name + "_buffer[" + name + "_offset])"] elif flavour.precision_name in ["C", "Z"]: cuda_complex = "cuDoubleComplex" if flavour.precision_name == "Z" else "cuComplex" a = ["reinterpret_cast<" + prefix + cuda_complex + "*>" + "(&" + name + "_buffer[" + name + "_offset])"] else: a = ["&" + name + "_buffer[" + name + "_offset]"] c = [] if name in ["x", "y", "z"]: c = ["static_cast(" + name + "_" + self.postfix(name) + ")"] elif name in ["a", "b", "c"]: c = [name + "_" + self.postfix(name)] result = [", ".join(a + c)] if self.name == "trmm" and name == "a": result *= 2 return result return [] def buffer_type(self, name): """As above, but only data-types""" prefix = "const " if (name in self.inputs) else "" if (name in self.inputs) or (name in self.outputs): a = [prefix + "cl_mem"] b = ["const size_t" + self.b_star()] c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else [] if self.batched == 2: c += ["const size_t"] return [", ".join(a + b + c)] return [] def buffer_doc(self, name): """Retrieves the documentation of the buffers""" prefix = "const " if (name in self.inputs) else "" inout = "input" if (name in self.inputs) else "output" if (name in self.inputs) or (name in self.outputs): math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " tensor" if (name in self.buffers_tensor()) else name + " vector" inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] b = ["`const size_t " + 
self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."] c = [] if name not in self.buffers_without_ld_inc(): c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " + inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."] if self.batched == 2: c += ["`const size_t " + name + "_stride`: The (fixed) stride between two batches of the " + name.upper() + " matrix."] return a + b + c return [] def scalar(self, name): """Retrieves the name of a scalar (alpha/beta)""" if name in self.scalars: if self.batched == 1: return [name + "s_cpp"] return [name] return [] def scalar_cpp(self, name): """As above, but with _cpp as a suffix""" if name in self.scalars: return [name + "_cpp"] return [] def scalar_half_to_float(self, name): """As above, but converts from float to half""" if name in self.scalars: return ["HalfToFloat(" + name + ")"] return [] def scalar_use(self, name, flavour): """Retrieves the use of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": if self.batched == 1: return ["alphas_cpp.data()"] return [flavour.use_alpha()] elif name == "beta": if self.batched == 1: return ["betas_cpp.data()"] return [flavour.use_beta()] return [name] return [] def scalar_use_wrapper(self, name, flavour): """As above, but for the clBLAS wrapper""" if name in self.scalars: if name == "alpha": return [flavour.use_alpha_opencl()] elif name == "beta": return [flavour.use_beta_opencl()] return [name] return [] def scalar_use_wrapper_cblas(self, name, flavour): """As above, but for the CBLAS wrapper""" if name in self.scalars: if flavour.is_complex(name): return [name + "_array.data()"] return [name] return [] def scalar_use_wrapper_cublas(self, name, flavour): """As above, but for the cuBLAS wrapper""" if name in self.scalars: if flavour.is_complex(name): return ["&" + name + "_cuda"] return ["&" + name] return [] def scalar_def(self, name, flavour): """Retrieves the definition of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": return ["const " + flavour.alpha_cl + " " + self.b_star() + name + self.b_s()] return ["const " + flavour.beta_cl + " " + self.b_star() + name + self.b_s()] return [] def scalar_def_plain(self, name, flavour): """As above, but without 'cl_' prefix""" if name in self.scalars: if name == "alpha": return ["const " + flavour.alpha_cpp + " " + self.b_star() + name + self.b_s()] return ["const " + flavour.beta_cpp + " " + self.b_star() + name + self.b_s()] return [] def scalar_def_void(self, name, flavour): """Retrieves the definition of a scalar (alpha/beta) but make it a void pointer in case of non-standard types""" if name in self.scalars: if name == "alpha": data_type = "void*" if flavour.is_complex("alpha") else flavour.alpha_cpp return ["const " + data_type + " " + name] data_type = "void*" if flavour.is_complex("beta") else flavour.beta_cpp return ["const " + data_type + " " + name] return [] def scalar_type(self, name, flavour): """Retrieves the type of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": return ["const " + flavour.alpha_cpp + self.b_star()] return ["const " + flavour.beta_cpp + self.b_star()] return [] def scalar_doc(self, name): """Retrieves the documentation of a scalar""" if name in self.scalars: if name == "alpha": return ["`const " + self.template.alpha_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."] return ["`const " 
+ self.template.beta_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."] return [] def scalar_create_cpp(self, flavour): """Creates a C++ version of a scalar based on a void*""" result = [] for name in self.scalars: if name == "alpha": result.append("const auto alpha_cpp = " + flavour.use_alpha_clblast() + ";") elif name == "beta": result.append("const auto beta_cpp = " + flavour.use_beta_clblast() + ";") return result def sizes_list(self): """Retrieves a list of comma-separated sizes (m, n, k)""" if self.sizes: return [", ".join([s for s in self.sizes])] return [] def sizes_list_as_int(self): """Retrieves a list of comma-separated sizes (m, n, k) cast to integers""" if self.sizes: return [", ".join(["static_cast(" + s + ")" for s in self.sizes])] return [] def sizes_def(self): """Retrieves the definition of the sizes (m,n,k)""" if self.sizes: return [", ".join(["const size_t " + s for s in self.sizes])] return [] def sizes_def_netlib(self): """Retrieves the definition of the sizes (m,n,k) for the CBLAS API""" if self.sizes: return [", ".join(["const int " + s for s in self.sizes])] return [] def sizes_type(self): """Retrieves the types of the sizes (m,n,k)""" if self.sizes: return [", ".join(["const size_t" for s in self.sizes])] return [] def sizes_doc(self): """# Retrieves the documentation of the sizes""" if self.sizes: definitions = ["`const size_t " + s + "`: Integer size argument. This value must be positive." for s in self.sizes] return definitions return [] def options_list(self): """Retrieves a list of options""" if self.options: return [", ".join(self.options)] return [] def options_list_no_layout(self): """Retrieves a list of options""" options = self.options[:] if "layout" in options: options.remove("layout") if options: return [", ".join(options)] return [] def options_cast(self, indent): """As above, but now casted to CLBlast data-types""" if self.options: options = ["static_cast(" + o + ")" for o in self.options] return [(",\n" + indent).join(options)] return [] def options_def(self): """Retrieves the definitions of the options (layout, transpose, side, etc.)""" if self.options: definitions = ["const " + convert.option_to_clblast(o) + " " + o for o in self.options] return [", ".join(definitions)] return [] def options_def_c(self): """As above, but now for the C API""" if self.options: definitions = ["const CLBlast" + convert.option_to_clblast(o) + " " + o for o in self.options] return [", ".join(definitions)] return [] def options_def_wrapper_clblas(self): """As above, but now using clBLAS data-types""" if self.options: definitions = ["const " + convert.option_to_clblas(o) + " " + o for o in self.options] return [", ".join(definitions)] return [] def options_def_wrapper_cblas(self): """As above, but now using CBLAS data-types""" if self.options: definitions = ["const " + convert.option_to_cblas(o) + " " + o for o in self.options] return [", ".join(definitions)] return [] def options_def_wrapper_cublas(self): """As above, but now using cuBLAS data-types""" if self.options: definitions = ["const " + convert.option_to_cublas(o) + " " + o for o in self.options] return [", ".join(definitions)] return [] def options_type(self): """Retrieves the types of the options (layout, transpose, side, etc.)""" if self.options: definitions = ["const " + convert.option_to_clblast(o) for o in self.options] return [", ".join(definitions)] return [] def options_doc(self): """Retrieves the documentation of the options""" if self.options: definitions = 
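# NOTE: illustrative sketch only, not part of the original file. In the generated
# clblast.cpp, the casts produced by options_cast() above appear in the form
#
#     static_cast<clblast::Layout>(layout),
#     static_cast<clblast::Transpose>(a_transpose),
#
# i.e. one static_cast per option, joined with the supplied indentation.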
["`const " + convert.option_to_clblast(o) + " " + o + "`: " + convert.option_to_documentation(o) for o in self.options] return definitions return [] def arguments(self): """Retrieves a combination of all the argument names (no types)""" return (self.options_list() + self.sizes_list() + list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) + self.scalar("alpha") + list(chain(*[self.buffer(b) for b in self.buffers_first()])) + self.scalar("beta") + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) def arguments_half(self): """As above, but with conversions from half to float""" return (self.options_list() + self.sizes_list() + list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_first()])) + self.scalar_half_to_float("alpha") + list(chain(*[self.buffer_bis(b) for b in self.buffers_first()])) + self.scalar_half_to_float("beta") + list(chain(*[self.buffer_bis(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) def arguments_clcudaapi(self): """Retrieves a combination of all the argument names, with CLCudaAPI casts""" return (self.options_list() + self.sizes_list() + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_first()])) + self.scalar("alpha") + list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_first()])) + self.scalar("beta") + list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar(s) for s in self.other_scalars()])) + self.batch_count_list()) def arguments_cast(self, flavour, indent): """As above, but with CLBlast casts""" return (self.options_cast(indent) + self.sizes_list() + list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) + self.scalar_use("alpha", flavour) + list(chain(*[self.buffer(b) for b in self.buffers_first()])) + self.scalar_use("beta", flavour) + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])) + self.batch_count_list()) def arguments_netlib(self, flavour, indent): """As above, but for the Netlib CBLAS API""" return (self.options_cast(indent) + self.sizes_list() + list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_first()])) + self.scalar_cpp("alpha") + list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_first()])) + self.scalar_cpp("beta") + list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) def arguments_wrapper_clblas(self, flavour): """As above, but for the clBLAS wrapper""" return (self.options_list() + self.sizes_list() + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_first()])) + self.scalar_use_wrapper("alpha", flavour) + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_first()])) + self.scalar_use_wrapper("beta", flavour) + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_second()])) + 
list(chain(*[self.scalar_use_wrapper(s, flavour) for s in self.other_scalars()]))) def arguments_wrapper_cblas(self, flavour): """As above, but for the CBLAS wrapper""" return (self.options_list() + self.sizes_list_as_int() + self.scalar_use_wrapper_cblas("alpha", flavour) + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_first()])) + self.scalar_use_wrapper_cblas("beta", flavour) + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_second()])) + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_use_wrapper_cblas(s, flavour) for s in self.other_scalars()]))) def arguments_wrapper_cublas(self, flavour): """As above, but for the cuBLAS wrapper""" return (self.options_list_no_layout() + self.sizes_list_as_int() + self.scalar_use_wrapper_cublas("alpha", flavour) + list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.buffers_first()])) + self.scalar_use_wrapper_cublas("beta", flavour) + list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.buffers_second()])) + list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.scalar_buffers_first()])) + list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_use_wrapper_cublas(s, flavour) for s in self.other_scalars()]))) def arguments_def(self, flavour): """Retrieves a combination of all the argument definitions""" return (self.options_def() + self.sizes_def() + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) + self.scalar_def("alpha", flavour) + list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) + self.scalar_def("beta", flavour) + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) + self.batch_count_def()) def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" result=(self.options_def_c() + self.sizes_def_netlib() + self.scalar_def_void("alpha", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + self.scalar_def_void("beta", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) if self.name in self.routines_scalar_no_return(): result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) result += self.batch_count_def() return result def arguments_def_c(self, flavour): """As above, but for the C API""" return (self.options_def_c() + self.sizes_def() + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) + self.scalar_def("alpha", flavour) + list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) + self.scalar_def("beta", flavour) + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) + self.batch_count_def()) def arguments_def_wrapper_clblas(self, flavour): """As above, but clBLAS wrapper plain data-types""" return (self.options_def_wrapper_clblas() + self.sizes_def() + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in 
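# NOTE: illustrative sketch only, not part of the original file. All of the
# arguments_*() builders above and below emit parameters in one fixed order:
#
#     options, sizes, scalar-result buffers, alpha, first buffers, beta,
#     second buffers, second scalar buffers, other scalars, batch_count
#
# so that the generated C, C++, Netlib and wrapper APIs stay argument-compatible.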
self.scalar_buffers_first()])) + self.scalar_def_plain("alpha", flavour) + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_first()])) + self.scalar_def_plain("beta", flavour) + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_second()])) + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) def arguments_def_wrapper_cblas(self, flavour): """As above, but CBLAS wrapper plain data-types""" return (self.options_def_wrapper_cblas() + self.sizes_def() + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_first()])) + self.scalar_def_plain("alpha", flavour) + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_first()])) + self.scalar_def_plain("beta", flavour) + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_second()])) + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) def arguments_def_wrapper_cublas(self, flavour): """As above, but cuBLAS wrapper plain data-types""" return (self.options_def_wrapper_cublas() + self.sizes_def() + list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.scalar_buffers_first()])) + self.scalar_def_plain("alpha", flavour) + list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.buffers_first()])) + self.scalar_def_plain("beta", flavour) + list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.buffers_second()])) + list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) def arguments_type(self, flavour): """Retrieves a combination of all the argument types""" return (self.options_type() + self.sizes_type() + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_first()])) + self.scalar_type("alpha", flavour) + list(chain(*[self.buffer_type(b) for b in self.buffers_first()])) + self.scalar_type("beta", flavour) + list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()])) + self.batch_count_type()) def arguments_doc(self): """Retrieves a combination of all the argument types""" return (self.options_doc() + self.sizes_doc() + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_first()])) + self.scalar_doc("alpha") + list(chain(*[self.buffer_doc(b) for b in self.buffers_first()])) + self.scalar_doc("beta") + list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_doc(s) for s in self.other_scalars()])) + self.batch_count_doc()) def arguments_python(self): """Arguments for the Python wrapper pyclblast""" result = list() result.extend(self.sizes) if self.batched == 2: # strided batched result.append("batch_count") buffers = self.inputs + self.outputs result.extend(buffers[:]) if self.batched != 1: # regular or strided-batched for buf in buffers: if buf in self.buffers_matrix(): result.append(buf + "_ld") for buf in buffers: if buf in self.buffers_vector(): result.append(buf + "_inc = 1") if self.batched == 2: # strided batched for buf in buffers: if buf in 
self.buffers_matrix(): result.append(buf + "_stride") for scalar in self.scalars: if scalar != "": default = "1.0" if scalar == "alpha" else "0.0" result.append(scalar + " = " + default) else: # batched but not strided-batched for scalar in self.scalars: result.append(scalar + "s") for buf in buffers: if buf in self.buffers_matrix(): result.append(buf + "_ld") for buf in buffers: if buf in self.buffers_vector() + self.buffers_matrix(): result.append(buf + "_offsets") for buf in buffers: if buf in self.buffers_vector(): result.append(buf + "_inc = 1") for option in self.options: if option == "a_transpose": result.append("a_transp = False") if option == "b_transpose": result.append("b_transp = False") if option == "ab_transpose": result.append("ab_transp = False") if option == "side": result.append("right_side = False") if option == "triangle": result.append("lower_triangle = False") if option == "diagonal": result.append("unit_diagonal = False") if self.batched != 1: for buf in buffers: result.append(buf + "_offset = 0") return result def requirements_doc(self): """Retrieves a list of routine requirements for documentation""" return self.requirements def routine_header_cpp(self, spaces, default_event, cuda=False, implementation=False): """Retrieves the C++ templated definition for a routine""" indent = " " * (spaces + self.length()) arguments = self.arguments_def(self.template) mem_type = "cl_mem" if cuda: arguments = [a.replace(mem_type, "CUdeviceptr") for a in arguments] mem_type = "CUdeviceptr" result = "template <" + self.template.name + ">\n" result += "StatusCode " + self.capitalized_name() + "(" result += (",\n" + indent).join([a for a in arguments]) result += ",\n" + indent if cuda: result += "const CUcontext context, const CUdevice device" else: result += "cl_command_queue* queue, cl_event* event" + default_event if self.temp_buffer: result += ",\n" + indent + mem_type + " temp_buffer" if not implementation: result += " = 0" if cuda else " = nullptr" result += ")" return result def routine_header_type_cpp(self, spaces, cuda=False): """As above, but now without variable names""" indent = " " * (spaces + self.length()) arguments = self.arguments_type(self.template) if cuda: arguments = [a.replace("cl_mem", "CUdeviceptr") for a in arguments] result = "template <" + self.template.name + ">\n" result += "StatusCode " + self.capitalized_name() + "(" result += (",\n" + indent).join([a for a in arguments]) result += ",\n" + indent if cuda: result += "const CUcontext, const CUdevice" else: result += "cl_command_queue*, cl_event*" result += ")" return result def routine_header_c(self, flavour, spaces, extra_qualifier): """As above, but now for C""" indent = " " * (spaces + self.length()) result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.plain_name() + "(" result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)]) result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)" return result def routine_header_netlib(self, flavour, spaces, extra_qualifier): """As above, but now for the original Netlib CBLAS API""" return_type = "void" for output in self.outputs: if output in self.index_buffers(): return_type = "int" break if output in self.scalar_buffers_first() and self.name not in self.routines_scalar_no_return(): return_type = flavour.buffer_type.replace("2", "") break indent = " " * (spaces + len(return_type) + self.length()) routine_name = self.name if self.name in self.routines_scalar_no_return(): routine_name += "_sub" indent += " 
" if self.batched != 0: routine_name += "batched" result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" return result def routine_header_wrapper_clblas(self, flavour, def_only, spaces): """As above, but now for the clBLAS wrapper""" template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else "" indent = " " * (spaces + self.length() + len(template)) result = "" if self.no_scalars(): result += "template <" if def_only: result += flavour.name result += ">\n" result += "clblasStatus clblasX" + self.name + template + "(" result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_clblas(flavour)]) result += ",\n" + indent + "cl_uint num_queues, cl_command_queue *queues" result += ",\n" + indent + "cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" return result def routine_header_wrapper_cblas(self, flavour, spaces): """As above, but now for the CBLAS wrapper""" indent = " " * (spaces + self.length()) result = "void cblasX" + self.name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cblas(flavour)]) + ")" return result def routine_header_wrapper_cublas(self, flavour, def_only, spaces): """As above, but now for the cuBLAS wrapper""" template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else "" indent = " " * (spaces + self.length() + len(template)) result = "" if self.no_scalars(): result += "template <" if def_only: result += flavour.name result += ">\n" result += "cublasStatus_t cublasX" + self.name + template + "(cublasHandle_t handle, " result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cublas(flavour)]) + ")" return result CLBlast-1.6.3/src/000077500000000000000000000000001463263031500135775ustar00rootroot00000000000000CLBlast-1.6.3/src/api_common.cpp000066400000000000000000000203041463263031500164230ustar00rootroot00000000000000// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the common (non-OpenCL-specific) functions of the CLBlast API. // // ================================================================================================= #include #include "utilities/utilities.hpp" #include "cache.hpp" #include "routines/routines.hpp" namespace clblast { // ================================================================================================= // Clears the cache of stored binaries StatusCode ClearCache() { try { ProgramCache::Instance().Invalidate(); BinaryCache::Instance().Invalidate(); } catch (...) 
{ return DispatchException(); } return StatusCode::kSuccess; } template void FillCacheForPrecision(Queue &queue) { try { // Runs all the level 1 set-up functions Xswap(queue, nullptr); Xswap(queue, nullptr); Xswap(queue, nullptr); Xswap(queue, nullptr); Xscal(queue, nullptr); Xscal(queue, nullptr); Xcopy(queue, nullptr); Xcopy(queue, nullptr); Xaxpy(queue, nullptr); Xaxpy(queue, nullptr); Xdot(queue, nullptr); Xdotu(queue, nullptr); Xdotc(queue, nullptr); Xnrm2(queue, nullptr); Xnrm2(queue, nullptr); Xasum(queue, nullptr); Xasum(queue, nullptr); Xsum(queue, nullptr); Xsum(queue, nullptr); Xamax(queue, nullptr); Xamax(queue, nullptr); Xmax(queue, nullptr); Xmax(queue, nullptr); Xmin(queue, nullptr); Xmin(queue, nullptr); // Runs all the level 2 set-up functions Xgemv(queue, nullptr); Xgemv(queue, nullptr); Xgbmv(queue, nullptr); Xgbmv(queue, nullptr); Xhemv(queue, nullptr); Xhbmv(queue, nullptr); Xhpmv(queue, nullptr); Xsymv(queue, nullptr); Xsbmv(queue, nullptr); Xspmv(queue, nullptr); Xtrmv(queue, nullptr); Xtrmv(queue, nullptr); Xtbmv(queue, nullptr); Xtbmv(queue, nullptr); Xtpmv(queue, nullptr); Xtpmv(queue, nullptr); Xger(queue, nullptr); Xgeru(queue, nullptr); Xgerc(queue, nullptr); Xher(queue, nullptr); Xhpr(queue, nullptr); Xher2(queue, nullptr); Xhpr2(queue, nullptr); Xsyr(queue, nullptr); Xspr(queue, nullptr); Xsyr2(queue, nullptr); Xspr2(queue, nullptr); // Runs all the level 3 set-up functions Xgemm(queue, nullptr); Xgemm(queue, nullptr); Xsymm(queue, nullptr); Xsymm(queue, nullptr); Xhemm(queue, nullptr); Xsyrk(queue, nullptr); Xsyrk(queue, nullptr); Xherk(queue, nullptr); Xsyr2k(queue, nullptr); Xsyr2k(queue, nullptr); Xher2k(queue, nullptr); Xtrmm(queue, nullptr); Xtrmm(queue, nullptr); // Runs all the non-BLAS set-up functions Xomatcopy(queue, nullptr); Xomatcopy(queue, nullptr); } catch(const RuntimeErrorCode &e) { if (e.status() != StatusCode::kNoDoublePrecision && e.status() != StatusCode::kNoHalfPrecision) { throw; } } } // Fills the cache with all binaries for a specific device // TODO: Add half-precision FP16 set-up calls StatusCode FillCache(const RawDeviceID device) { try { // Creates a sample context and queue to match the normal routine calling conventions auto device_cpp = Device(device); auto context = Context(device_cpp); auto queue = Queue(context, device_cpp); FillCacheForPrecision(queue); FillCacheForPrecision(queue); } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } // ================================================================================================= // Retrieves the current tuning parameters for this device-precision-kernel combination StatusCode RetrieveParameters(const RawDeviceID device, const std::string &kernel_name, const Precision precision, std::unordered_map ¶meters) { try { // Retrieves the device name const auto device_cpp = Device(device); const auto platform_id = device_cpp.PlatformID(); const auto device_name = GetDeviceName(device_cpp); // Retrieves the database values auto in_cache = false; auto database = DatabaseCache::Instance().Get(DatabaseKeyRef{platform_id, device, precision, kernel_name}, &in_cache); if (!in_cache) { log_debug("Searching database for kernel '" + kernel_name + "'"); database = Database(device_cpp, kernel_name, precision, {}); } // Retrieves the parameters for (const auto ¶meter: database.GetParameters()) { parameters[parameter.first] = parameter.second; } } catch (...) 
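// A minimal client-side sketch (illustrative only, not part of this file) of how the two
// tuning entry points around this point combine: RetrieveParameters (above) yields the
// complete set of tuning parameters for a kernel, and OverrideParameters (below) installs a
// modified set. Note from the verification code below that the override must supply *all*
// parameter names, otherwise kMissingOverrideParameter is returned. "Xgemm" and "MWG" are
// example names; valid ones come from the retrieved map itself:
//
//   cl_device_id device = ...;  // obtained through the regular OpenCL API
//   auto parameters = std::unordered_map<std::string, size_t>{};
//   clblast::RetrieveParameters(device, "Xgemm", clblast::Precision::kSingle, parameters);
//   parameters["MWG"] = 64;  // tweak one entry, keep the rest of the full set intact
//   clblast::OverrideParameters(device, "Xgemm", clblast::Precision::kSingle, parameters);
//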
{ return DispatchException(); } return StatusCode::kSuccess; } // Overrides the tuning parameters for this device-precision-kernel combination StatusCode OverrideParameters(const RawDeviceID device, const std::string &kernel_name, const Precision precision, const std::unordered_map ¶meters) { try { // Retrieves the device name const auto device_cpp = Device(device); const auto platform_id = device_cpp.PlatformID(); const auto device_name = GetDeviceName(device_cpp); // Retrieves the current database values to verify whether the new ones are complete auto in_cache = false; auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{platform_id, device, precision, kernel_name}, &in_cache); if (!in_cache) { log_debug("Searching database for kernel '" + kernel_name + "'"); current_database = Database(device_cpp, kernel_name, precision, {}); } // Verifies the parameters size const auto current_parameter_names = current_database.GetParameterNames(); if (current_parameter_names.size() > parameters.size()) { return StatusCode::kMissingOverrideParameter; } // Retrieves the names and values separately and in the same order as the existing database auto parameter_values = database::Params{0}; auto i = size_t{0}; for (const auto ¤t_param : current_parameter_names) { if (parameters.find(current_param) == parameters.end()) { return StatusCode::kMissingOverrideParameter; } const auto parameter_value = parameters.at(current_param); parameter_values[i] = parameter_value; ++i; } // Creates a small custom database based on the provided parameters const auto database_device = database::DatabaseDevice{database::kDeviceNameDefault, parameter_values}; const auto database_architecture = database::DatabaseArchitecture{"default", {database_device}}; const auto database_vendor = database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_architecture}}; const auto database_entry = database::DatabaseEntry{kernel_name, precision, current_parameter_names, {database_vendor}}; const auto database_entries = std::vector{database_entry}; const auto database = Database(device_cpp, kernel_name, precision, database_entries); // Removes the old database entry and stores the new one in the cache DatabaseCache::Instance().Remove(DatabaseKey{platform_id, device, precision, kernel_name}); DatabaseCache::Instance().Store(DatabaseKey{platform_id, device, precision, kernel_name}, Database(database)); } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/cache.cpp000066400000000000000000000103041463263031500153440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the caching functionality of compiled binaries and programs. 
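//
// Typical access pattern (a sketch of how the routines use these caches; see cache.hpp for
// the key/value typedefs): look up under a lightweight reference key, and only on a miss
// build the object and store it under a newly constructed key. Store() tolerates the race
// in which two threads both miss and both try to insert the same object. CompileProgram()
// below is a hypothetical helper, used only to illustrate the shape of the code:
//
//   auto in_cache = false;
//   auto program = ProgramCache::Instance().Get(
//       ProgramKeyRef{context(), device(), precision, routine_name}, &in_cache);
//   if (!in_cache) {
//     program = CompileProgram(context, device, precision, routine_name);  // hypothetical
//     ProgramCache::Instance().Store(ProgramKey{context(), device(), precision, routine_name},
//                                    std::shared_ptr<Program>(program));
//   }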
// // ================================================================================================= #include #include #include #include "database/database.hpp" #include "cache.hpp" namespace clblast { // ================================================================================================= template template Value Cache::Get(const U &key, bool *in_cache) const { std::lock_guard lock(cache_mutex_); #if __cplusplus >= 201402L // generalized std::map::find() of C++14 auto it = cache_.find(key); #else // O(n) lookup in a vector auto it = std::find_if(cache_.begin(), cache_.end(), [&] (const std::pair &pair) { return pair.first == key; }); #endif if (it == cache_.end()) { if (in_cache) { *in_cache = false; } return Value(); } if (in_cache) { *in_cache = true; } return it->second; } template void Cache::Store(Key &&key, Value &&value) { std::lock_guard lock(cache_mutex_); #if __cplusplus >= 201402L // emplace() into a map auto r = cache_.emplace(std::move(key), std::move(value)); if (!r.second) { // The object is already in cache. This can happen if two threads both // checked the cache for an object, both found that it isn't there, then // both produced the object (e.g. a compiled binary) and try to store it // in the cache. The first one will succeed normally, the second one will // hit this point. We simply return in this case. return; } #else // emplace_back() into a vector cache_.emplace_back(std::move(key), std::move(value)); #endif } template void Cache::Remove(const Key &key) { std::lock_guard lock(cache_mutex_); #if __cplusplus >= 201402L cache_.erase(key); #else auto it = cache_.begin(); while (it != cache_.end()) { if ((*it).first == key) { it = cache_.erase(it); } else ++it; } #endif } template template void Cache::RemoveBySubset(const Key &key) { std::lock_guard lock(cache_mutex_); auto it = cache_.begin(); while (it != cache_.end()) { const auto current_key = (*it).first; if ((std::get(key) == std::get(current_key)) && (std::get(key) == std::get(current_key))) { it = cache_.erase(it); } else ++it; } } template void Cache::Invalidate() { std::lock_guard lock(cache_mutex_); cache_.clear(); } template Cache &Cache::Instance() { return instance_; } template Cache Cache::instance_; // ================================================================================================= template class Cache; template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; // ================================================================================================= template class Cache>; template std::shared_ptr ProgramCache::Get(const ProgramKeyRef &, bool *) const; template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name // ================================================================================================= template class Cache; template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/cache.hpp000066400000000000000000000112661463263031500153610ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // This file implements the caching functionality of compiled binaries and programs. // // ================================================================================================= #ifndef CLBLAST_CACHE_H_ #define CLBLAST_CACHE_H_ #include #include #include #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= // The generic thread-safe cache. We assume that the Key may be a heavyweight struct that is not // normally used by the caller, while the Value is either lightweight or ref-counted. // Hence, searching by non-Key is supported (if there is a corresponding operator<()), and // on Store() the Key instance is moved from the caller (because it will likely be constructed // as temporary at the time of Store()). template class Cache { public: // Cached object is returned by-value to avoid racing with Invalidate(). // Due to lack of std::optional<>, in case of a cache miss we return a default-constructed // Value and set the flag to false. template Value Get(const U &key, bool *in_cache) const; // We do not return references to just stored object to avoid racing with Invalidate(). // Caller is expected to store a temporary. void Store(Key &&key, Value &&value); void Invalidate(); // Removes all entries with a given key void Remove(const Key &key); template void RemoveBySubset(const Key &key); // currently supports 2 indices static Cache &Instance(); private: #if __cplusplus >= 201402L // The std::less allows to search in cache by an object comparable with Key, without // constructing a temporary Key // (see http://en.cppreference.com/w/cpp/utility/functional/less_void, // http://www.open-std.org/JTC1/SC22/WG21/docs/papers/2013/n3657.htm, // http://stackoverflow.com/questions/10536788/avoiding-key-construction-for-stdmapfind) std::map> cache_; #else std::vector> cache_; #endif mutable std::mutex cache_mutex_; static Cache instance_; }; // class Cache // ================================================================================================= // The key struct for the cache of compiled OpenCL binaries (device name and platform-dependent) // Order of fields: precision, routine_name, device_name (smaller fields first) typedef std::tuple BinaryKey; typedef std::tuple BinaryKeyRef; typedef Cache BinaryCache; extern template class Cache; extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; // ================================================================================================= // The key struct for the cache of compiled OpenCL programs (context-dependent) // Order of fields: context, device_id, precision, routine_name (smaller fields first) typedef std::tuple ProgramKey; typedef std::tuple ProgramKeyRef; typedef Cache> ProgramCache; extern template class Cache>; extern template std::shared_ptr ProgramCache::Get(const ProgramKeyRef &, bool *) const; // ================================================================================================= class Database; // The key struct for the cache of database maps. 
// Order of fields: platform_id, device_id, precision, kernel_name (smaller fields first) typedef std::tuple DatabaseKey; typedef std::tuple DatabaseKeyRef; typedef Cache DatabaseCache; extern template class Cache; extern template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const; // ================================================================================================= } // namespace clblast // CLBLAST_CACHE_H_ #endif CLBlast-1.6.3/src/clblast.cpp000066400000000000000000004655231463263031500157460ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements all the BLAS API calls. In all cases, it does not much more than creating // a new object of the appropriate type, and calling the main routine on that object. It forwards // all status codes to the caller. // // ================================================================================================= #include #include "routines/routines.hpp" #include "clblast.h" namespace clblast { // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= // Generate givens plane rotation: SROTG/DROTG template StatusCode Rotg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Generate modified givens plane rotation: SROTMG/DROTMG template StatusCode Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Apply givens plane rotation: SROT/DROT template StatusCode Rot(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, const T, const T, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rot(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, const float, const float, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Rot(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, const double, const double, cl_command_queue*, cl_event*); // Apply modified givens plane rotation: SROTM/DROTM template StatusCode Rotm(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_mem, const 
size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotm(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Rotm(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xswap(queue_cpp, event); routine.DoSwap(n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template StatusCode Scal(const size_t n, const T alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xscal(queue_cpp, event); routine.DoScal(n, alpha, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Scal(const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Scal(const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Scal(const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Scal(const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Scal(const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xcopy(queue_cpp, event); routine.DoCopy(n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) 
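// A minimal end-to-end sketch of calling one of these level-1 routines from client code
// (illustrative only; OpenCL error handling omitted). Given a cl_context 'context' and a
// cl_device_id 'device' from the usual OpenCL set-up, the raw buffer handle is passed
// straight to CLBlast:
//
//   cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);
//   cl_mem x = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(float), nullptr, nullptr);
//   // ... fill x with clEnqueueWriteBuffer ...
//   cl_event event = nullptr;
//   auto status = clblast::Scal<float>(n, 2.0f, x, 0, 1, &queue, &event);  // x = 2 * x
//   if (status == clblast::StatusCode::kSuccess) {
//     clWaitForEvents(1, &event);
//     clReleaseEvent(event);
//   }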
{ return DispatchException(); } } template StatusCode PUBLIC_API Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template StatusCode Axpy(const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xaxpy(queue_cpp, event); routine.DoAxpy(n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Axpy(const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Axpy(const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Axpy(const size_t, const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Axpy(const size_t, const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Axpy(const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Dot product of two vectors: SDOT/DDOT/HDOT template StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xdot(queue_cpp, event); routine.DoDot(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) 
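// Note on the interface of Dot above: unlike Netlib CBLAS, which returns the scalar result
// directly, the result is written to a device-side buffer at the given offset. A sketch of
// retrieving it on the host (illustrative only, reusing the context/queue/event set-up from
// the earlier sketch):
//
//   cl_mem dot = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), nullptr, nullptr);
//   clblast::Dot<float>(n, dot, 0, x, 0, 1, y, 0, 1, &queue, &event);
//   auto result = 0.0f;
//   clEnqueueReadBuffer(queue, dot, CL_TRUE, 0, sizeof(float), &result, 1, &event, nullptr);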
{ return DispatchException(); } } template StatusCode PUBLIC_API Dot(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Dot(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Dot(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Dot product of two complex vectors: CDOTU/ZDOTU template StatusCode Dotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xdotu(queue_cpp, event); routine.DoDotu(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Dotu(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Dotu(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC template StatusCode Dotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xdotc(queue_cpp, event); routine.DoDotc(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Dotc(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Dotc(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xnrm2(queue_cpp, event); routine.DoNrm2(n, Buffer(nrm2_buffer), nrm2_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Nrm2(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Nrm2(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Nrm2(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Nrm2(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Nrm2(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template StatusCode Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xasum(queue_cpp, event); routine.DoAsum(n, Buffer(asum_buffer), asum_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Asum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Asum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Asum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Asum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Asum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsum(queue_cpp, event); routine.DoSum(n, Buffer(sum_buffer), sum_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Sum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Sum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Sum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Sum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Sum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xamax(queue_cpp, event); routine.DoAmax(n, Buffer(imax_buffer), imax_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Amax(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Amax(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Amax(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Amax(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Amax(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN template StatusCode Amin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xamin(queue_cpp, event); routine.DoAmin(n, Buffer(imin_buffer), imin_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
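// Note on the interface of Amax above (and Amin/Max/Min below): the resulting index is not
// returned but stored in the first element of an integer device buffer. The element type is
// assumed here to be unsigned int (check the public header for the authoritative type), and
// the index refers to a position within the strided vector view defined by x_offset/x_inc:
//
//   cl_mem imax = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(unsigned int), nullptr, nullptr);
//   clblast::Amax<float>(n, imax, 0, x, 0, 1, &queue, &event);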
{ return DispatchException(); } } template StatusCode PUBLIC_API Amin(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Amin(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Amin(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Amin(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Amin(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xmax(queue_cpp, event); routine.DoMax(n, Buffer(imax_buffer), imax_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Max(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Max(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Max(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Max(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Max(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xmin(queue_cpp, event); routine.DoMin(n, Buffer(imin_buffer), imin_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Min(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Min(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Min(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Min(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Min(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV template StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgemv(queue_cpp, event); routine.DoGemv(layout, a_transpose, m, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc, beta, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t 
y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgbmv(queue_cpp, event); routine.DoGbmv(layout, a_transpose, m, n, kl, ku, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc, beta, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template StatusCode Hemv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xhemv(queue_cpp, event); routine.DoHemv(layout, triangle, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc, beta, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) 
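// Storage note for the banded routine (GBMV) above, assuming the standard BLAS band layout
// with Layout::kColMajor: element A(i, j) of the m-by-n matrix with kl sub-diagonals and ku
// super-diagonals lives at a_buffer[a_offset + (ku + i - j) + j * a_ld], valid for
// max(0, j - ku) <= i <= min(m - 1, j + kl), so a_ld must be at least kl + ku + 1. A
// host-side packing sketch for a tridiagonal matrix (kl = ku = 1, a_ld = 3, illustrative
// only):
//
//   std::vector<float> band(3 * n, 0.0f);
//   for (size_t j = 0; j < n; ++j) {
//     if (j > 0)     { band[0 + j * 3] = -1.0f; }  // super-diagonal A(j-1, j)
//     band[1 + j * 3] = 4.0f;                      // main diagonal  A(j, j)
//     if (j + 1 < n) { band[2 + j * 3] = -1.0f; }  // sub-diagonal   A(j+1, j)
//   }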
{ return DispatchException(); } } template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template StatusCode Hbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xhbmv(queue_cpp, event); routine.DoHbmv(layout, triangle, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc, beta, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template StatusCode Hpmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xhpmv(queue_cpp, event); routine.DoHpmv(layout, triangle, n, alpha, Buffer(ap_buffer), ap_offset, Buffer(x_buffer), x_offset, x_inc, beta, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const size_t, const float2, const cl_mem, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const size_t, const double2, const cl_mem, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsymv(queue_cpp, event); routine.DoSymv(layout, triangle, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc, beta, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template StatusCode Sbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsbmv(queue_cpp, event); routine.DoSbmv(layout, triangle, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc, beta, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template StatusCode Spmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xspmv(queue_cpp, event); routine.DoSpmv(layout, triangle, n, alpha, Buffer(ap_buffer), ap_offset, Buffer(x_buffer), x_offset, x_inc, beta, Buffer(y_buffer), y_offset, y_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtrmv(queue_cpp, event); routine.DoTrmv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtbmv(queue_cpp, event); routine.DoTbmv(layout, triangle, a_transpose, diagonal, n, k, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtpmv(queue_cpp, event); routine.DoTpmv(layout, triangle, a_transpose, diagonal, n, Buffer(ap_buffer), ap_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtrsv(queue_cpp, event); routine.DoTrsv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 matrix update: SGER/DGER/HGER template StatusCode Ger(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xger(queue_cpp, event); routine.DoGer(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 complex matrix update: CGERU/ZGERU template StatusCode Geru(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgeru(queue_cpp, event); routine.DoGeru(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Geru(const Layout, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Geru(const Layout, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template StatusCode Gerc(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgerc(queue_cpp, event); routine.DoGerc(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
// Hermitian rank-1 matrix update: CHER/ZHER
template <typename T>
StatusCode Her(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xher<std::complex<T>,T>(queue_cpp, event);
    routine.DoHer(layout, triangle, n, alpha,
                  Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
                  Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her<float>(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Her<double>(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Hermitian packed rank-1 matrix update: CHPR/ZHPR
template <typename T>
StatusCode Hpr(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem ap_buffer, const size_t ap_offset,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event);
    routine.DoHpr(layout, triangle, n, alpha,
                  Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
                  Buffer<std::complex<T>>(ap_buffer), ap_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpr<float>(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hpr<double>(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);

// Hermitian rank-2 matrix update: CHER2/ZHER2
template <typename T>
StatusCode Her2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xher2<T>(queue_cpp, event);
    routine.DoHer2(layout, triangle, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her2<float2>(const Layout, const Triangle, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Her2<double2>(const Layout, const Triangle, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
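// Note on the mixed types above: HER and HPR take a real-valued alpha but update a complex
// matrix, which is why the dispatch instantiates Xher<std::complex<T>,T> with T the real type.
// A minimal caller-side sketch (illustrative only; buffer names are hypothetical, and a null
// event is passed to keep the example short):
static StatusCode ExampleHerUsage(cl_command_queue queue, const size_t n, cl_mem x, cl_mem a) {
  // Computes A += 1.5 * x * x^H on the upper triangle of an n-by-n complex matrix
  return Her<float>(Layout::kRowMajor, Triangle::kUpper, n, 1.5f,
                    x, 0, 1, a, 0, n, &queue, nullptr);
}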
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
template <typename T>
StatusCode Hpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem ap_buffer, const size_t ap_offset,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhpr2<T>(queue_cpp, event);
    routine.DoHpr2(layout, triangle, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(ap_buffer), ap_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpr2<float2>(const Layout, const Triangle, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);

// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsyr<T>(queue_cpp, event);
    routine.DoSyr(layout, triangle, n, alpha,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr<float>(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem ap_buffer, const size_t ap_offset,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xspr<T>(queue_cpp, event);
    routine.DoSpr(layout, triangle, n, alpha,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(ap_buffer), ap_offset);
    return StatusCode::kSuccess;
  } catch (...)
  { return DispatchException(); }
}
template StatusCode PUBLIC_API Spr<float>(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);

// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsyr2<T>(queue_cpp, event);
    routine.DoSyr2(layout, triangle, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr2<float>(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem ap_buffer, const size_t ap_offset,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xspr2<T>(queue_cpp, event);
    routine.DoSpr2(layout, triangle, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(ap_buffer), ap_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Spr2<float>(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
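// The packed routines above (HPR/HPR2/SPR/SPR2) store only one triangle of the matrix
// contiguously, so the AP buffer holds n*(n+1)/2 elements rather than n*n. A small helper
// illustrating the arithmetic (hypothetical; not used by the library itself):
static size_t PackedMatrixElements(const size_t n) {
  return n * (n + 1) / 2;  // elements in one triangle, diagonal included
}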
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================

// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                const size_t m, const size_t n, const size_t k, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event, cl_mem temp_buffer) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xgemm<T>(queue_cpp, event);
    const auto temp_buffer_provided = temp_buffer != nullptr;
    auto temp_buffer_cpp = temp_buffer_provided ? Buffer<T>(temp_buffer) : Buffer<T>(nullptr);
    routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld,
                   beta, Buffer<T>(c_buffer), c_offset, c_ld,
                   temp_buffer_cpp, temp_buffer_provided);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gemm<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem);
template StatusCode PUBLIC_API Gemm<double>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem);
template StatusCode PUBLIC_API Gemm<float2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem);
template StatusCode PUBLIC_API Gemm<double2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem);
template StatusCode PUBLIC_API Gemm<half>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem);
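// A minimal caller-side sketch of the GEMM entry point above (illustrative only; the queue
// and buffers are assumed to be created by the caller, and the trailing temp-buffer argument
// may be omitted to let CLBlast allocate its own scratch space):
static StatusCode ExampleGemmUsage(cl_command_queue queue, cl_mem a, cl_mem b, cl_mem c) {
  const auto m = size_t{128}, n = size_t{64}, k = size_t{256};
  cl_event event = nullptr;
  // Computes C = 1.0 * A * B + 0.0 * C for row-major matrices (a_ld = k, b_ld = n, c_ld = n)
  const auto status = Gemm<float>(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
                                  m, n, k, 1.0f, a, 0, k, b, 0, n, 0.0f, c, 0, n,
                                  &queue, &event);
  if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
  return status;
}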
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsymm<T>(queue_cpp, event);
    routine.DoSymm(layout, side, triangle, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld,
                   beta, Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...)
  { return DispatchException(); }
}
template StatusCode PUBLIC_API Symm<float>(const Layout, const Side, const Triangle, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symm<double>(const Layout, const Side, const Triangle, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symm<float2>(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symm<double2>(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symm<half>(const Layout, const Side, const Triangle, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhemm<T>(queue_cpp, event);
    routine.DoHemm(layout, side, triangle, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld,
                   beta, Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hemm<float2>(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsyrk<T>(queue_cpp, event);
    routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   beta, Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syrk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syrk<double>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syrk<float2>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
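// Caller-side sketch for the rank-K update above (illustrative only; names are hypothetical).
// SYRK computes C = alpha * A * A^T + beta * C and touches only the requested triangle of C:
static StatusCode ExampleSyrkUsage(cl_command_queue queue, const size_t n, const size_t k,
                                   cl_mem a, cl_mem c) {
  cl_event event = nullptr;
  // Row-major A is n-by-k (a_ld = k); C is n-by-n (c_ld = n), lower triangle updated
  return Syrk<float>(Layout::kRowMajor, Triangle::kLower, Transpose::kNo,
                     n, k, 1.0f, a, 0, k, 0.0f, c, 0, n, &queue, &event);
}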
// Rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xherk<std::complex<T>,T>(queue_cpp, event);
    routine.DoHerk(layout, triangle, a_transpose, n, k, alpha,
                   Buffer<std::complex<T>>(a_buffer), a_offset, a_ld,
                   beta, Buffer<std::complex<T>>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Herk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k, const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsyr2k<T>(queue_cpp, event);
    routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha,
                    Buffer<T>(a_buffer), a_offset, a_ld,
                    Buffer<T>(b_buffer), b_offset, b_ld,
                    beta, Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...)
  { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr2k<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2k<double>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2k<float2>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k, const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xher2k<T,U>(queue_cpp, event);
    routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha,
                    Buffer<T>(a_buffer), a_offset, a_ld,
                    Buffer<T>(b_buffer), b_offset, b_ld,
                    beta, Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her2k<float2,float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xtrmm<T>(queue_cpp, event);
    routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...)
  { return DispatchException(); }
}
template StatusCode PUBLIC_API Trmm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n, const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xtrsm<T>(queue_cpp, event);
    routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...)
  { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================

// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
template <typename T>
StatusCode Had(const size_t n, const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
               const T beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhad<T>(queue_cpp, event);
    routine.DoHad(n, alpha,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(y_buffer), y_offset, y_inc,
                  beta, Buffer<T>(z_buffer), z_offset, z_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Had<float>(const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<double>(const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<float2>(const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<double2>(const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<half>(const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
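// Caller-side sketch for the element-wise product above (illustrative only; buffer names
// are hypothetical). HAD computes z[i] = alpha * x[i] * y[i] + beta * z[i]:
static StatusCode ExampleHadUsage(cl_command_queue queue, const size_t n,
                                  cl_mem x, cl_mem y, cl_mem z) {
  cl_event event = nullptr;
  return Had<float>(n, 1.0f, x, 0, 1, y, 0, 1, 0.0f, z, 0, 1, &queue, &event);
}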
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
                    const size_t m, const size_t n, const T alpha,
                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                    cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xomatcopy<T>(queue_cpp, event);
    routine.DoOmatcopy(layout, a_transpose, m, n, alpha,
                       Buffer<T>(a_buffer), a_offset, a_ld,
                       Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);
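// Caller-side sketch for the out-of-place copy above (illustrative only; names are
// hypothetical). With Transpose::kYes this writes B = alpha * A^T, so an m-by-n input
// becomes an n-by-m output:
static StatusCode ExampleOmatcopyUsage(cl_command_queue queue, const size_t m, const size_t n,
                                       cl_mem a, cl_mem b) {
  cl_event event = nullptr;
  // Row-major: input A has a_ld = n, transposed output B has b_ld = m
  return Omatcopy<float>(Layout::kRowMajor, Transpose::kYes, m, n, 1.0f,
                         a, 0, n, b, 0, m, &queue, &event);
}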
// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
template <typename T>
StatusCode Im2col(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w,
                  const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w,
                  const size_t dilation_h, const size_t dilation_w,
                  const cl_mem im_buffer, const size_t im_offset,
                  cl_mem col_buffer, const size_t col_offset,
                  cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xim2col<T>(queue_cpp, event);
    routine.DoIm2col(kernel_mode, channels, height, width, kernel_h, kernel_w,
                     pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                     Buffer<T>(im_buffer), im_offset,
                     Buffer<T>(col_buffer), col_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Im2col<float>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Im2col<double>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Im2col<float2>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Im2col<double2>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Im2col<half>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
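// The column buffer produced above has one entry per kernel position. The spatial output
// size per dimension follows the usual convolution arithmetic; a small helper illustrating
// it (hypothetical, not used by the library itself):
static size_t ConvOutputSize(const size_t size, const size_t kernel, const size_t pad,
                             const size_t stride, const size_t dilation) {
  const auto effective_kernel = dilation * (kernel - 1) + 1;  // extent of a dilated kernel
  return (size + 2 * pad - effective_kernel) / stride + 1;    // number of kernel positions
}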
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
template <typename T>
StatusCode Col2im(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w,
                  const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w,
                  const size_t dilation_h, const size_t dilation_w,
                  const cl_mem col_buffer, const size_t col_offset,
                  cl_mem im_buffer, const size_t im_offset,
                  cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xcol2im<T>(queue_cpp, event);
    routine.DoCol2im(kernel_mode, channels, height, width, kernel_h, kernel_w,
                     pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                     Buffer<T>(col_buffer), col_offset,
                     Buffer<T>(im_buffer), im_offset);
    return StatusCode::kSuccess;
  } catch (...)
  { return DispatchException(); }
}
template StatusCode PUBLIC_API Col2im<float>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<double>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<float2>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<double2>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<half>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);

// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
template <typename T>
StatusCode Convgemm(const KernelMode kernel_mode,
                    const size_t channels, const size_t height, const size_t width,
                    const size_t kernel_h, const size_t kernel_w,
                    const size_t pad_h, const size_t pad_w,
                    const size_t stride_h, const size_t stride_w,
                    const size_t dilation_h, const size_t dilation_w,
                    const size_t num_kernels, const size_t batch_count,
                    const cl_mem im_buffer, const size_t im_offset,
                    const cl_mem kernel_buffer, const size_t kernel_offset,
                    cl_mem result_buffer, const size_t result_offset,
                    cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xconvgemm<T>(queue_cpp, event);
    routine.DoConvgemm(kernel_mode, channels, height, width, kernel_h, kernel_w,
                       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                       num_kernels, batch_count,
                       Buffer<T>(im_buffer), im_offset,
                       Buffer<T>(kernel_buffer), kernel_offset,
                       Buffer<T>(result_buffer), result_offset);
    return StatusCode::kSuccess;
  } catch (...)
  { return DispatchException(); }
}
template StatusCode PUBLIC_API Convgemm<float>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Convgemm<double>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Convgemm<half>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const cl_mem, const size_t, const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*);

// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n, const T *alphas,
                       const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
                       cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
                       const size_t batch_count,
                       cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = XaxpyBatched<T>(queue_cpp, event);
    auto alphas_cpp = std::vector<T>();
    auto x_offsets_cpp = std::vector<size_t>();
    auto y_offsets_cpp = std::vector<size_t>();
    for (auto batch = size_t{0}; batch < batch_count; ++batch) {
      alphas_cpp.push_back(alphas[batch]);
      x_offsets_cpp.push_back(x_offsets[batch]);
      y_offsets_cpp.push_back(y_offsets[batch]);
    }
    routine.DoAxpyBatched(n, alphas_cpp,
                          Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
                          Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
                          batch_count);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t, const float*, const cl_mem, const size_t*, const size_t, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t, const double*, const cl_mem, const size_t*, const size_t, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t, const float2*, const cl_mem, const size_t*, const size_t, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t, const double2*, const cl_mem, const size_t*, const size_t, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t, const half*, const cl_mem, const size_t*, const size_t, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
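// Caller-side sketch for the batched AXPY above (illustrative only; names are hypothetical).
// Each batch has its own alpha and its own element offset into the shared x and y buffers:
static StatusCode ExampleAxpyBatchedUsage(cl_command_queue queue, const size_t n,
                                          cl_mem x, cl_mem y) {
  const auto batch_count = size_t{2};
  const float alphas[] = {1.0f, 2.0f};
  const size_t x_offsets[] = {0, n};  // batch 1 starts right after batch 0
  const size_t y_offsets[] = {0, n};
  cl_event event = nullptr;
  return AxpyBatched<float>(n, alphas, x, x_offsets, 1, y, y_offsets, 1,
                            batch_count, &queue, &event);
}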
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                       const size_t m, const size_t n, const size_t k, const T *alphas,
                       const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                       const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
                       const T *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                       const size_t batch_count,
                       cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = XgemmBatched<T>(queue_cpp, event);
    auto alphas_cpp = std::vector<T>();
    auto betas_cpp = std::vector<T>();
    auto a_offsets_cpp = std::vector<size_t>();
    auto b_offsets_cpp = std::vector<size_t>();
    auto c_offsets_cpp = std::vector<size_t>();
    for (auto batch = size_t{0}; batch < batch_count; ++batch) {
      alphas_cpp.push_back(alphas[batch]);
      betas_cpp.push_back(betas[batch]);
      a_offsets_cpp.push_back(a_offsets[batch]);
      b_offsets_cpp.push_back(b_offsets[batch]);
      c_offsets_cpp.push_back(c_offsets[batch]);
    }
    routine.DoGemmBatched(layout, a_transpose, b_transpose, m, n, k, alphas_cpp,
                          Buffer<T>(a_buffer), a_offsets_cpp, a_ld,
                          Buffer<T>(b_buffer), b_offsets_cpp, b_ld,
                          betas_cpp, Buffer<T>(c_buffer), c_offsets_cpp, c_ld,
                          batch_count);
    return StatusCode::kSuccess;
  } catch (...)
  { return DispatchException(); }
}
template StatusCode PUBLIC_API GemmBatched<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float*, const cl_mem, const size_t*, const size_t, const cl_mem, const size_t*, const size_t, const float*, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<double>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double*, const cl_mem, const size_t*, const size_t, const cl_mem, const size_t*, const size_t, const double*, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<float2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2*, const cl_mem, const size_t*, const size_t, const cl_mem, const size_t*, const size_t, const float2*, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<double2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2*, const cl_mem, const size_t*, const size_t, const cl_mem, const size_t*, const size_t, const double2*, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half*, const cl_mem, const size_t*, const size_t, const cl_mem, const size_t*, const size_t, const half*, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*);

// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED
template <typename T>
StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                              const size_t m, const size_t n, const size_t k, const T alpha,
                              const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                              const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
                              const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                              const size_t batch_count,
                              cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = XgemmStridedBatched<T>(queue_cpp, event);
    routine.DoGemmStridedBatched(layout, a_transpose, b_transpose, m, n, k, alpha,
                                 Buffer<T>(a_buffer), a_offset, a_ld, a_stride,
                                 Buffer<T>(b_buffer), b_offset, b_ld, b_stride,
                                 beta, Buffer<T>(c_buffer), c_offset, c_ld, c_stride,
                                 batch_count);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API GemmStridedBatched<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const size_t, const cl_mem, const size_t, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmStridedBatched<double>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const size_t, const cl_mem, const size_t, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmStridedBatched<float2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const size_t, const cl_mem, const size_t, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmStridedBatched<double2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const size_t, const cl_mem, const size_t, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmStridedBatched<half>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const size_t, const cl_mem, const size_t, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*);
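// In the strided-batched variant above, batch i reads its A matrix at element offset
// a_offset + i * a_stride (and similarly for B and C), so a single alpha/beta pair and one
// offset/stride per buffer replace the per-batch offset arrays of GemmBatched. A small
// helper illustrating the indexing (hypothetical, not used by the library):
static size_t StridedBatchOffset(const size_t offset, const size_t stride, const size_t batch) {
  return offset + batch * stride;  // element offset of this batch's matrix
}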
// =================================================================================================

// Retrieves the required size of the temporary buffer for the GEMM kernel (optional)
template <typename T>
StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                              const size_t m, const size_t n, const size_t k,
                              const size_t a_offset, const size_t a_ld,
                              const size_t b_offset, const size_t b_ld,
                              const size_t c_offset, const size_t c_ld,
                              cl_command_queue* queue, size_t& temp_buffer_size) {
  try {
    // Retrieves the tuning database
    const auto queue_cpp = Queue(*queue);
    const auto device = queue_cpp.GetDevice();
    const auto kernel_names = std::vector<std::string>{"Xgemm", "GemmRoutine"};
    Databases db(kernel_names);
    Routine::InitDatabase(device, kernel_names, PrecisionValue<T>(), {}, db);

    // Computes the buffer size
    if (Xgemm<T>::UseDirectKernel(m, n, k, db["XGEMM_MIN_INDIRECT_SIZE"])) {
      temp_buffer_size = 0;
    }
    else {
      temp_buffer_size = Xgemm<T>::GetTempSize(layout, a_transpose, b_transpose, m, n, k,
                                               a_offset, a_ld, b_offset, b_ld, c_offset, c_ld,
                                               db["MWG"], db["NWG"], db["KWG"] * db["KREG"],
                                               db["GEMMK"]);
    }
    temp_buffer_size *= sizeof(T);  // translate from num-elements to bytes
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API GemmTempBufferSize<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<double>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<float2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<double2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<half>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, cl_command_queue*, size_t&);
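// Caller-side sketch tying the query above to GEMM (illustrative only; error handling is
// reduced to single status checks and all names are hypothetical):
static StatusCode ExampleGemmWithTempBuffer(cl_context context, cl_command_queue queue,
                                            const size_t m, const size_t n, const size_t k,
                                            cl_mem a, cl_mem b, cl_mem c) {
  auto temp_size = size_t{0};
  auto status = GemmTempBufferSize<float>(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
                                          m, n, k, 0, k, 0, n, 0, n, &queue, temp_size);
  if (status != StatusCode::kSuccess) { return status; }
  cl_mem temp = nullptr;
  if (temp_size > 0) {  // only the indirect kernel needs scratch space
    temp = clCreateBuffer(context, CL_MEM_READ_WRITE, temp_size, nullptr, nullptr);
  }
  status = Gemm<float>(Layout::kRowMajor, Transpose::kNo, Transpose::kNo, m, n, k,
                       1.0f, a, 0, k, b, 0, n, 0.0f, c, 0, n, &queue, nullptr, temp);
  if (temp != nullptr) { clFinish(queue); clReleaseMemObject(temp); }
  return status;
}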
// =================================================================================================

} // namespace clblast
CLBlast-1.6.3/src/clblast_c.cpp000066400000000000000000007224251463263031500162440ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements all the plain C BLAS API calls. This forwards the calls to the C++ API.
//
// =================================================================================================

#include <cstdlib>
#include <string>

#include "utilities/utilities.hpp"
#include "clblast_c.h"
#include "clblast.h"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// ROTG
CLBlastStatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
                               cl_mem sb_buffer, const size_t sb_offset,
                               cl_mem sc_buffer, const size_t sc_offset,
                               cl_mem ss_buffer, const size_t ss_offset,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Rotg<float>(sa_buffer, sa_offset, sb_buffer, sb_offset, sc_buffer, sc_offset, ss_buffer, ss_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
                               cl_mem sb_buffer, const size_t sb_offset,
                               cl_mem sc_buffer, const size_t sc_offset,
                               cl_mem ss_buffer, const size_t ss_offset,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Rotg<double>(sa_buffer, sa_offset, sb_buffer, sb_offset, sc_buffer, sc_offset, ss_buffer, ss_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
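// The C bindings in this file all follow the pattern above: forward to the templated C++ API
// and map any exception to a status code. A minimal C-style usage sketch (illustrative only;
// the queue and the four single-element buffers are assumed to exist already):
static CLBlastStatusCode example_rotg_usage(cl_command_queue queue,
                                            cl_mem sa, cl_mem sb, cl_mem sc, cl_mem ss) {
  cl_event event = NULL;
  CLBlastStatusCode status = CLBlastSrotg(sa, 0, sb, 0, sc, 0, ss, 0, &queue, &event);
  if (status == CLBlastSuccess) {
    clWaitForEvents(1, &event);  // the event signals completion of the routine
    clReleaseEvent(event);
  }
  return status;
}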
// ROTMG
CLBlastStatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                                cl_mem sd2_buffer, const size_t sd2_offset,
                                cl_mem sx1_buffer, const size_t sx1_offset,
                                const cl_mem sy1_buffer, const size_t sy1_offset,
                                cl_mem sparam_buffer, const size_t sparam_offset,
                                cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Rotmg<float>(sd1_buffer, sd1_offset, sd2_buffer, sd2_offset, sx1_buffer, sx1_offset, sy1_buffer, sy1_offset, sparam_buffer, sparam_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                                cl_mem sd2_buffer, const size_t sd2_offset,
                                cl_mem sx1_buffer, const size_t sx1_offset,
                                const cl_mem sy1_buffer, const size_t sy1_offset,
                                cl_mem sparam_buffer, const size_t sparam_offset,
                                cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Rotmg<double>(sd1_buffer, sd1_offset, sd2_buffer, sd2_offset, sx1_buffer, sx1_offset, sy1_buffer, sy1_offset, sparam_buffer, sparam_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// ROT
CLBlastStatusCode CLBlastSrot(const size_t n,
                              cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                              const float cos, const float sin,
                              cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Rot<float>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, cos, sin, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDrot(const size_t n,
                              cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                              cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                              const double cos, const double sin,
                              cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Rot<double>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, cos, sin, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// ROTM
CLBlastStatusCode CLBlastSrotm(const size_t n,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_mem sparam_buffer, const size_t sparam_offset,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Rotm<float>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, sparam_buffer, sparam_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDrotm(const size_t n,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_mem sparam_buffer, const size_t sparam_offset,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Rotm<double>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, sparam_buffer, sparam_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// SWAP
CLBlastStatusCode CLBlastSswap(const size_t n,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Swap<float>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...)
  { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDswap(const size_t n,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Swap<double>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCswap(const size_t n,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Swap<float2>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZswap(const size_t n,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Swap<double2>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHswap(const size_t n,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Swap<half>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// SCAL
CLBlastStatusCode CLBlastSscal(const size_t n, const float alpha,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Scal<float>(n, alpha, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDscal(const size_t n, const double alpha,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Scal<double>(n, alpha, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCscal(const size_t n, const cl_float2 alpha,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Scal<float2>(n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZscal(const size_t n, const cl_double2 alpha,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Scal<double2>(n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHscal(const size_t n, const cl_half alpha,
                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Scal<half>(n, alpha, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
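// Note on the complex wrappers above: the C API passes cl_float2/cl_double2 by value, which
// the wrapper unpacks into the C++ float2/double2 type via its s[0]/s[1] fields (real and
// imaginary part). A minimal sketch of a complex scale from the C side (illustrative only;
// the buffer name is hypothetical and no completion event is requested):
static CLBlastStatusCode example_cscal_usage(cl_command_queue queue, const size_t n, cl_mem x) {
  cl_float2 alpha;
  alpha.s[0] = 0.0f;  // real part
  alpha.s[1] = 1.0f;  // imaginary part: scales x by the imaginary unit
  return CLBlastCscal(n, alpha, x, 0, 1, &queue, NULL);
}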
// COPY
CLBlastStatusCode CLBlastScopy(const size_t n,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Copy<float>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDcopy(const size_t n,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Copy<double>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCcopy(const size_t n,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Copy<float2>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZcopy(const size_t n,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Copy<double2>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHcopy(const size_t n,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Copy<half>(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// AXPY
CLBlastStatusCode CLBlastSaxpy(const size_t n, const float alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Axpy<float>(n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDaxpy(const size_t n, const double alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Axpy<double>(n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCaxpy(const size_t n, const cl_float2 alpha,
                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                               cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Axpy<float2>(n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...)
{ return static_cast(clblast::DispatchExceptionForC()); } } CLBlastStatusCode CLBlastZaxpy(const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Axpy(n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } CLBlastStatusCode CLBlastHaxpy(const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Axpy(n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } // DOT CLBlastStatusCode CLBlastSdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Dot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } CLBlastStatusCode CLBlastDdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Dot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } CLBlastStatusCode CLBlastHdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Dot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } // DOTU CLBlastStatusCode CLBlastCdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Dotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } CLBlastStatusCode CLBlastZdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Dotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event) ); } catch (...) 
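// Editor's note (illustrative, not part of the CLBlast sources): unlike CPU BLAS, the dot product
// lands in a device-side buffer rather than being returned by value. The hedged sketch below reads
// it back synchronously; it assumes x and y are existing device buffers of length n, and passes
// NULL for the optional event.
#if 0
static float example_sdot_usage(cl_context context, cl_command_queue queue,
                                cl_mem x, cl_mem y, const size_t n) {
  cl_int err = CL_SUCCESS;
  cl_mem dot = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, &err);
  float result = 0.0f;
  if (CLBlastSdot(n, dot, 0, x, 0, 1, y, 0, 1, &queue, NULL) == CLBlastSuccess) {
    clEnqueueReadBuffer(queue, dot, CL_TRUE, 0, sizeof(float), &result, 0, NULL, NULL);
  }
  clReleaseMemObject(dot);
  return result;
}
#endif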
// DOTC
CLBlastStatusCode CLBlastCdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Dotc<float2>(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Dotc<double2>(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// NRM2
CLBlastStatusCode CLBlastSnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Nrm2<float>(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Nrm2<double>(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastScnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Nrm2<float2>(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Nrm2<double2>(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Nrm2<half>(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// ASUM
CLBlastStatusCode CLBlastSasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Asum<float>(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Asum<double>(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastScasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Asum<float2>(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Asum<double2>(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Asum<half>(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// SUM
CLBlastStatusCode CLBlastSsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Sum<float>(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Sum<double>(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastScsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Sum<float2>(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Sum<double2>(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Sum<half>(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// AMAX
CLBlastStatusCode CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amax<float>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiDamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amax<double>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiCamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amax<float2>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amax<double2>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiHamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amax<half>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// AMIN
CLBlastStatusCode CLBlastiSamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amin<float>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiDamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amin<double>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiCamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amin<float2>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiZamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amin<double2>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiHamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Amin<half>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
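// Editor's note (illustrative, not part of the CLBlast sources): the iAMAX/iAMIN family writes
// the winning index into a device buffer. The hedged sketch below assumes that buffer holds an
// unsigned int, which matches the documented integer result type; verify against clblast_c.h for
// your version before relying on it.
#if 0
static unsigned int example_isamax_usage(cl_context context, cl_command_queue queue,
                                         cl_mem x, const size_t n) {
  cl_int err = CL_SUCCESS;
  cl_mem imax = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(unsigned int), NULL, &err);
  unsigned int index = 0;
  if (CLBlastiSamax(n, imax, 0, x, 0, 1, &queue, NULL) == CLBlastSuccess) {
    clEnqueueReadBuffer(queue, imax, CL_TRUE, 0, sizeof(unsigned int), &index, 0, NULL, NULL);
  }
  clReleaseMemObject(imax);
  return index;
}
#endif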
// MAX
CLBlastStatusCode CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Max<float>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiDmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Max<double>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiCmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Max<float2>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Max<double2>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiHmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Max<half>(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// MIN
CLBlastStatusCode CLBlastiSmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Min<float>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiDmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Min<double>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiCmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Min<float2>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Min<double2>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastiHmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Min<half>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================

// GEMV
CLBlastStatusCode CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemv<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
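// Editor's note (illustrative, not part of the CLBlast sources): a hedged sketch of a GEMV call,
// y = alpha*A*x + beta*y, for a row-major m-by-n matrix (so a_ld is the column count n). The
// helper name and the pre-existing device buffers are assumptions.
#if 0
static CLBlastStatusCode example_sgemv_usage(cl_command_queue queue, cl_mem a, cl_mem x, cl_mem y,
                                             const size_t m, const size_t n) {
  cl_event event = NULL;
  const CLBlastStatusCode status =
      CLBlastSgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo, m, n,
                   1.0f, a, 0, n, x, 0, 1, 0.0f, y, 0, 1, &queue, &event);
  if (status == CLBlastSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
  return status;
}
#endif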
// GBMV
CLBlastStatusCode CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gbmv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, kl, ku, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gbmv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, kl, ku, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gbmv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, kl, ku, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gbmv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, kl, ku, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gbmv<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, kl, ku, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// HEMV
CLBlastStatusCode CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hemv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hemv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// HBMV
CLBlastStatusCode CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hbmv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, k, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hbmv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, k, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// HPMV
CLBlastStatusCode CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hpmv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, float2{alpha.s[0], alpha.s[1]}, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hpmv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, double2{alpha.s[0], alpha.s[1]}, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
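// Editor's note (illustrative, not part of the CLBlast sources): complex scalars cross the C API
// as cl_float2/cl_double2 and are unpacked above via alpha.s[0]/alpha.s[1]. The hedged sketch
// below packs alpha = 1+0i and beta = 0+0i for a Hermitian matrix-vector product; buffer setup is
// assumed to exist elsewhere.
#if 0
static CLBlastStatusCode example_chemv_usage(cl_command_queue queue, cl_mem a, cl_mem x, cl_mem y,
                                             const size_t n) {
  cl_float2 alpha; alpha.s[0] = 1.0f; alpha.s[1] = 0.0f;  /* 1 + 0i */
  cl_float2 beta;  beta.s[0]  = 0.0f; beta.s[1]  = 0.0f;  /* 0 + 0i */
  return CLBlastChemv(CLBlastLayoutColMajor, CLBlastTriangleUpper, n,
                      alpha, a, 0, n, x, 0, 1, beta, y, 0, 1, &queue, NULL);
}
#endif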
// SYMV
CLBlastStatusCode CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Symv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Symv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Symv<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// SBMV
CLBlastStatusCode CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Sbmv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, k, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Sbmv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, k, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Sbmv<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, k, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// SPMV
CLBlastStatusCode CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spmv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spmv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spmv<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// TRMV
CLBlastStatusCode CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmv<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
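// Editor's note (illustrative, not part of the CLBlast sources): TRMV has no separate output
// vector; x is overwritten in place with op(A)*x. A hedged sketch with an upper-triangular,
// non-unit-diagonal, column-major A (helper name and buffers are assumptions):
#if 0
static CLBlastStatusCode example_strmv_usage(cl_command_queue queue, cl_mem a, cl_mem x,
                                             const size_t n) {
  return CLBlastStrmv(CLBlastLayoutColMajor, CLBlastTriangleUpper,
                      CLBlastTransposeNo, CLBlastDiagonalNonUnit,
                      n, a, 0, n, x, 0, 1, &queue, NULL);
}
#endif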
// TBMV
CLBlastStatusCode CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbmv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbmv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbmv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbmv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbmv<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// TPMV
CLBlastStatusCode CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpmv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpmv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpmv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpmv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpmv<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// TRSV
CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trsv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trsv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trsv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trsv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
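// Editor's note (illustrative, not part of the CLBlast sources): TRSV solves op(A)*x = b, with the
// right-hand side b passed in as x and overwritten by the solution. A hedged sketch for a
// lower-triangular system (helper name and buffers are assumptions):
#if 0
static CLBlastStatusCode example_strsv_usage(cl_command_queue queue, cl_mem a, cl_mem x,
                                             const size_t n) {
  return CLBlastStrsv(CLBlastLayoutColMajor, CLBlastTriangleLower,
                      CLBlastTransposeNo, CLBlastDiagonalNonUnit,
                      n, a, 0, n, x, 0, 1, &queue, NULL);
}
#endif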
// TBSV
CLBlastStatusCode CLBlastStbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbsv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbsv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbsv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tbsv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// TPSV
CLBlastStatusCode CLBlastStpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpsv<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpsv<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpsv<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Tpsv<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// GER
CLBlastStatusCode CLBlastSger(const CLBlastLayout layout, const size_t m, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Ger<float>(static_cast<clblast::Layout>(layout), m, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDger(const CLBlastLayout layout, const size_t m, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Ger<double>(static_cast<clblast::Layout>(layout), m, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHger(const CLBlastLayout layout, const size_t m, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Ger<half>(static_cast<clblast::Layout>(layout), m, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
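// Editor's note (illustrative, not part of the CLBlast sources): GER applies the rank-1 update
// A += alpha * x * y^T to an m-by-n matrix. A hedged sketch, row-major so a_ld = n (helper name
// and buffers are assumptions):
#if 0
static CLBlastStatusCode example_sger_usage(cl_command_queue queue, cl_mem x, cl_mem y, cl_mem a,
                                            const size_t m, const size_t n) {
  return CLBlastSger(CLBlastLayoutRowMajor, m, n, 1.0f,
                     x, 0, 1, y, 0, 1, a, 0, n, &queue, NULL);
}
#endif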
{ return static_cast(clblast::DispatchExceptionForC()); } } // GERU CLBlastStatusCode CLBlastCgeru(const CLBlastLayout layout, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Geru(static_cast(layout), m, n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } CLBlastStatusCode CLBlastZgeru(const CLBlastLayout layout, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Geru(static_cast(layout), m, n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } // GERC CLBlastStatusCode CLBlastCgerc(const CLBlastLayout layout, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Gerc(static_cast(layout), m, n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } CLBlastStatusCode CLBlastZgerc(const CLBlastLayout layout, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Gerc(static_cast(layout), m, n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event) ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } // HER CLBlastStatusCode CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { return static_cast( clblast::Her(static_cast(layout), static_cast(triangle), n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, queue, event) ); } catch (...) 
// HPR
CLBlastStatusCode CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hpr<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hpr<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HER2
CLBlastStatusCode CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Her2<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Her2<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HPR2
CLBlastStatusCode CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hpr2<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hpr2<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
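// Note on the packed ("P") routines above: the `ap_buffer` argument stores only the
// selected triangle of the n-by-n matrix, packed contiguously and without a leading
// dimension, so the buffer must hold n*(n+1)/2 elements. A hedged sizing sketch with
// hypothetical `context` and `err` variables:
//
//   const size_t n_packed = n * (n + 1) / 2;  // elements in one packed triangle
//   cl_mem ap_buf = clCreateBuffer(context, CL_MEM_READ_WRITE,
//                                  n_packed * sizeof(cl_float2), NULL, &err);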
// SYR
CLBlastStatusCode CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SPR
CLBlastStatusCode CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spr<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spr<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spr<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SYR2
CLBlastStatusCode CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr2<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr2<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr2<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SPR2
CLBlastStatusCode CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spr2<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spr2<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Spr2<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// GEMM
CLBlastStatusCode CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemm<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemm<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemm<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemm<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Gemm<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
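// Illustrative usage sketch (not part of the library): a plain C = alpha*A*B + beta*C
// in single precision. All handles (`queue`, `a_buf`, `b_buf`, `c_buf`) are
// hypothetical; for row-major non-transposed inputs, A is m-by-k with a_ld = k,
// B is k-by-n with b_ld = n, and C uses c_ld = n.
//
//   cl_event event = NULL;
//   CLBlastStatusCode status =
//       CLBlastSgemm(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
//                    m, n, k, 1.0f, a_buf, 0, k, b_buf, 0, n, 0.0f, c_buf, 0, n,
//                    &queue, &event);
//   if (status == CLBlastSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }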
// SYMM
CLBlastStatusCode CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Symm<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Symm<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Symm<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Symm<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Symm<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HEMM
CLBlastStatusCode CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hemm<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Hemm<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// SYRK
CLBlastStatusCode CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syrk<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syrk<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syrk<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syrk<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syrk<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HERK
CLBlastStatusCode CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Herk<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Herk<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
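// Note on HERK: unlike SYRK, the alpha and beta scalars are real (`float`/`double`)
// even though the matrices are complex, since C = alpha*A*A^H + beta*C only remains
// Hermitian for real scalars. A hedged call sketch with hypothetical handles:
//
//   CLBlastStatusCode status =
//       CLBlastCherk(CLBlastLayoutColMajor, CLBlastTriangleUpper, CLBlastTransposeNo,
//                    n, k, 1.0f, a_buf, 0, n, 0.0f, c_buf, 0, n, &queue, NULL);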
// SYR2K
CLBlastStatusCode CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr2k<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr2k<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr2k<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr2k<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Syr2k<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// HER2K
CLBlastStatusCode CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Her2k<float2, float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Her2k<double2, double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TRMM
CLBlastStatusCode CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmm<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmm<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmm<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmm<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trmm<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// TRSM
CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trsm<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trsm<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trsm<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Trsm<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
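// Illustrative usage sketch (not part of the library): solving the triangular system
// A * X = alpha * B in-place, with B overwritten by the solution X. Handles are
// hypothetical; A is m-by-m here because the triangular factor sits on the left.
//
//   CLBlastStatusCode status =
//       CLBlastStrsm(CLBlastLayoutRowMajor, CLBlastSideLeft, CLBlastTriangleLower,
//                    CLBlastTransposeNo, CLBlastDiagonalNonUnit, m, n, 1.0f,
//                    a_buf, 0, m, b_buf, 0, n, &queue, NULL);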
// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================
// HAD
CLBlastStatusCode CLBlastShad(const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const float beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Had<float>(n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, beta, z_buffer, z_offset, z_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDhad(const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const double beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Had<double>(n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, beta, z_buffer, z_offset, z_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastChad(const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const cl_float2 beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Had<float2>(n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, float2{beta.s[0], beta.s[1]}, z_buffer, z_offset, z_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZhad(const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const cl_double2 beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Had<double2>(n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, double2{beta.s[0], beta.s[1]}, z_buffer, z_offset, z_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHhad(const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, const cl_half beta, cl_mem z_buffer, const size_t z_offset, const size_t z_inc, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Had<half>(n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, beta, z_buffer, z_offset, z_inc, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
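// Note on HAD (a CLBlast extension, not part of standard BLAS): it computes the
// element-wise (Hadamard) product z = alpha * x .* y + beta * z. A hedged sketch
// with hypothetical handles, computing a plain z = x .* y:
//
//   CLBlastStatusCode status =
//       CLBlastShad(n, 1.0f, x_buf, 0, 1, y_buf, 0, 1, 0.0f, z_buf, 0, 1, &queue, NULL);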
// OMATCOPY
CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Omatcopy<float>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Omatcopy<double>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Omatcopy<float2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Omatcopy<double2>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Omatcopy<half>(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// IM2COL
CLBlastStatusCode CLBlastSim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Im2col<float>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, im_buffer, im_offset, col_buffer, col_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Im2col<double>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, im_buffer, im_offset, col_buffer, col_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Im2col<float2>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, im_buffer, im_offset, col_buffer, col_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Im2col<double2>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, im_buffer, im_offset, col_buffer, col_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHim2col(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem im_buffer, const size_t im_offset, cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Im2col<half>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, im_buffer, im_offset, col_buffer, col_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
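// Note on IM2COL sizing: with the usual convolution arithmetic, each spatial output
// dimension is (size + 2*pad - (dilation*(kernel-1)+1)) / stride + 1, and the column
// buffer holds channels*kernel_h*kernel_w rows by output_h*output_w columns. A hedged
// sizing sketch (all variables assumed defined by the caller):
//
//   const size_t output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
//   const size_t output_w = (width  + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
//   const size_t col_size = channels * kernel_h * kernel_w * output_h * output_w;  // elements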
// COL2IM
CLBlastStatusCode CLBlastScol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Col2im<float>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, col_offset, im_buffer, im_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Col2im<double>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, col_offset, im_buffer, im_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Col2im<float2>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, col_offset, im_buffer, im_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Col2im<double2>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, col_offset, im_buffer, im_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHcol2im(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const cl_mem col_buffer, const size_t col_offset, cl_mem im_buffer, const size_t im_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Col2im<half>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, col_offset, im_buffer, im_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// CONVGEMM
CLBlastStatusCode CLBlastSconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Convgemm<float>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count, im_buffer, im_offset, kernel_buffer, kernel_offset, result_buffer, result_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Convgemm<double>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count, im_buffer, im_offset, kernel_buffer, kernel_offset, result_buffer, result_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHconvgemm(const CLBlastKernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const cl_mem im_buffer, const size_t im_offset, const cl_mem kernel_buffer, const size_t kernel_offset, cl_mem result_buffer, const size_t result_offset, cl_command_queue* queue, cl_event* event) {
  try { return static_cast<CLBlastStatusCode>(clblast::Convgemm<half>(static_cast<clblast::KernelMode>(kernel_mode), channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count, im_buffer, im_offset, kernel_buffer, kernel_offset, result_buffer, result_offset, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
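// Note on CONVGEMM (a CLBlast extension): it runs a batched 2D convolution of
// `num_kernels` filters over `batch_count` images, roughly equivalent to an im2col
// transform followed by a GEMM per image, without the caller materialising the column
// buffer. Only S, D and H variants exist above. A hedged call sketch with
// hypothetical handles:
//
//   CLBlastStatusCode status =
//       CLBlastSconvgemm(CLBlastKernelModeCrossCorrelation, channels, height, width,
//                        kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
//                        dilation_h, dilation_w, num_kernels, batch_count,
//                        im_buf, 0, kernel_buf, 0, result_buf, 0, &queue, NULL);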
// =================================================================================================
// Batched version of BLAS routines
// =================================================================================================
// AXPY
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<float>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) { alphas_cpp.push_back(alphas[batch]); }
  try { return static_cast<CLBlastStatusCode>(clblast::AxpyBatched<float>(n, alphas_cpp.data(), x_buffer, x_offsets, x_inc, y_buffer, y_offsets, y_inc, batch_count, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<double>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) { alphas_cpp.push_back(alphas[batch]); }
  try { return static_cast<CLBlastStatusCode>(clblast::AxpyBatched<double>(n, alphas_cpp.data(), x_buffer, x_offsets, x_inc, y_buffer, y_offsets, y_inc, batch_count, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<float2>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) { alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]}); }
  try { return static_cast<CLBlastStatusCode>(clblast::AxpyBatched<float2>(n, alphas_cpp.data(), x_buffer, x_offsets, x_inc, y_buffer, y_offsets, y_inc, batch_count, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<double2>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) { alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]}); }
  try { return static_cast<CLBlastStatusCode>(clblast::AxpyBatched<double2>(n, alphas_cpp.data(), x_buffer, x_offsets, x_inc, y_buffer, y_offsets, y_inc, batch_count, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<half>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) { alphas_cpp.push_back(alphas[batch]); }
  try { return static_cast<CLBlastStatusCode>(clblast::AxpyBatched<half>(n, alphas_cpp.data(), x_buffer, x_offsets, x_inc, y_buffer, y_offsets, y_inc, batch_count, queue, event)); }
  catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
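// Illustrative usage sketch (not part of the library): the batched AXPY variants take
// per-batch scalars and per-batch offsets into a single x and a single y buffer. A
// hedged example with two batches of length n laid out back-to-back in each buffer:
//
//   const float alphas[] = {1.0f, 2.0f};
//   const size_t x_offsets[] = {0, n};
//   const size_t y_offsets[] = {0, n};
//   CLBlastStatusCode status =
//       CLBlastSaxpyBatched(n, alphas, x_buf, x_offsets, 1, y_buf, y_offsets, 1,
//                           2, &queue, NULL);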

// GEMM
CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                      const size_t m, const size_t n, const size_t k, const float *alphas,
                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const float *betas,
                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                      const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<float>();
  auto betas_cpp = std::vector<float>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
    alphas_cpp.push_back(alphas[batch]);
    betas_cpp.push_back(betas[batch]);
  }
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmBatched(static_cast<clblast::Layout>(layout),
                           static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                           m, n, k, alphas_cpp.data(),
                           a_buffer, a_offsets, a_ld,
                           b_buffer, b_offsets, b_ld, betas_cpp.data(),
                           c_buffer, c_offsets, c_ld,
                           batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                      const size_t m, const size_t n, const size_t k, const double *alphas,
                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const double *betas,
                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                      const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<double>();
  auto betas_cpp = std::vector<double>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
    alphas_cpp.push_back(alphas[batch]);
    betas_cpp.push_back(betas[batch]);
  }
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmBatched(static_cast<clblast::Layout>(layout),
                           static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                           m, n, k, alphas_cpp.data(),
                           a_buffer, a_offsets, a_ld,
                           b_buffer, b_offsets, b_ld, betas_cpp.data(),
                           c_buffer, c_offsets, c_ld,
                           batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                      const size_t m, const size_t n, const size_t k, const cl_float2 *alphas,
                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_float2 *betas,
                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                      const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<float2>();
  auto betas_cpp = std::vector<float2>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
    alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]});
    betas_cpp.push_back(float2{betas[batch].s[0], betas[batch].s[1]});
  }
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmBatched(static_cast<clblast::Layout>(layout),
                           static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                           m, n, k, alphas_cpp.data(),
                           a_buffer, a_offsets, a_ld,
                           b_buffer, b_offsets, b_ld, betas_cpp.data(),
                           c_buffer, c_offsets, c_ld,
                           batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                      const size_t m, const size_t n, const size_t k, const cl_double2 *alphas,
                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_double2 *betas,
                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                      const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<double2>();
  auto betas_cpp = std::vector<double2>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
    alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]});
    betas_cpp.push_back(double2{betas[batch].s[0], betas[batch].s[1]});
  }
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmBatched(static_cast<clblast::Layout>(layout),
                           static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                           m, n, k, alphas_cpp.data(),
                           a_buffer, a_offsets, a_ld,
                           b_buffer, b_offsets, b_ld, betas_cpp.data(),
                           c_buffer, c_offsets, c_ld,
                           batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                      const size_t m, const size_t n, const size_t k, const cl_half *alphas,
                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_half *betas,
                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                                      const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  auto alphas_cpp = std::vector<half>();
  auto betas_cpp = std::vector<half>();
  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
    alphas_cpp.push_back(alphas[batch]);
    betas_cpp.push_back(betas[batch]);
  }
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmBatched(static_cast<clblast::Layout>(layout),
                           static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                           m, n, k, alphas_cpp.data(),
                           a_buffer, a_offsets, a_ld,
                           b_buffer, b_offsets, b_ld, betas_cpp.data(),
                           c_buffer, c_offsets, c_ld,
                           batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
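
// Illustrative usage sketch (not part of the library itself): batched GEMM behaves like
// batch_count independent GEMMs sharing one cl_mem per matrix argument, with per-batch offsets
// and scalars. All variables below are assumed to exist, with column-major MxK, KxN and MxN
// matrices packed back-to-back in their buffers:
//
//   const float alphas[2] = {1.0f, 1.0f};
//   const float betas[2] = {0.0f, 0.0f};
//   const size_t a_offsets[2] = {0, m * k};
//   const size_t b_offsets[2] = {0, k * n};
//   const size_t c_offsets[2] = {0, m * n};
//   CLBlastStatusCode status =
//       CLBlastSgemmBatched(CLBlastLayoutColMajor, CLBlastTransposeNo, CLBlastTransposeNo,
//                           m, n, k, alphas, a_buffer, a_offsets, m,
//                           b_buffer, b_offsets, k, betas,
//                           c_buffer, c_offsets, m, 2, &queue, NULL);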

// GEMM (strided, batched)
CLBlastStatusCode CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const float alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const float beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                             const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmStridedBatched(static_cast<clblast::Layout>(layout),
                                  static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                  m, n, k, alpha,
                                  a_buffer, a_offset, a_ld, a_stride,
                                  b_buffer, b_offset, b_ld, b_stride, beta,
                                  c_buffer, c_offset, c_ld, c_stride,
                                  batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const double alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const double beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                             const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmStridedBatched(static_cast<clblast::Layout>(layout),
                                  static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                  m, n, k, alpha,
                                  a_buffer, a_offset, a_ld, a_stride,
                                  b_buffer, b_offset, b_ld, b_stride, beta,
                                  c_buffer, c_offset, c_ld, c_stride,
                                  batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const cl_float2 alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_float2 beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                             const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmStridedBatched(static_cast<clblast::Layout>(layout),
                                  static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                  m, n, k, float2{alpha.s[0], alpha.s[1]},
                                  a_buffer, a_offset, a_ld, a_stride,
                                  b_buffer, b_offset, b_ld, b_stride, float2{beta.s[0], beta.s[1]},
                                  c_buffer, c_offset, c_ld, c_stride,
                                  batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const cl_double2 alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_double2 beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                             const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmStridedBatched(static_cast<clblast::Layout>(layout),
                                  static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                  m, n, k, double2{alpha.s[0], alpha.s[1]},
                                  a_buffer, a_offset, a_ld, a_stride,
                                  b_buffer, b_offset, b_ld, b_stride, double2{beta.s[0], beta.s[1]},
                                  c_buffer, c_offset, c_ld, c_stride,
                                  batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const cl_half alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_half beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                                             const size_t batch_count, cl_command_queue* queue, cl_event* event) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmStridedBatched(static_cast<clblast::Layout>(layout),
                                  static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                  m, n, k, alpha,
                                  a_buffer, a_offset, a_ld, a_stride,
                                  b_buffer, b_offset, b_ld, b_stride, beta,
                                  c_buffer, c_offset, c_ld, c_stride,
                                  batch_count, queue, event)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
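
// Illustrative note (not part of the library itself): the strided-batched variant replaces the
// per-batch offset arrays with one fixed stride per matrix, so batch b reads A starting at
// a_offset + b * a_stride. For densely packed column-major matrices the strides are simply the
// matrix sizes:
//
//   const size_t a_stride = m * k;
//   const size_t b_stride = k * n;
//   const size_t c_stride = m * n;
//   CLBlastStatusCode status =
//       CLBlastSgemmStridedBatched(CLBlastLayoutColMajor, CLBlastTransposeNo, CLBlastTransposeNo,
//                                  m, n, k, 1.0f, a_buffer, 0, m, a_stride,
//                                  b_buffer, 0, k, b_stride, 0.0f,
//                                  c_buffer, 0, m, c_stride, batch_count, &queue, NULL);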

// =================================================================================================

// GEMM with temporary buffer (optional, for advanced users)
CLBlastStatusCode CLBlastSgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const float alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, cl_event* event, cl_mem temp_buffer) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::Gemm(static_cast<clblast::Layout>(layout),
                    static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                    m, n, k, alpha,
                    a_buffer, a_offset, a_ld,
                    b_buffer, b_offset, b_ld, beta,
                    c_buffer, c_offset, c_ld,
                    queue, event, temp_buffer)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const double alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, cl_event* event, cl_mem temp_buffer) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::Gemm(static_cast<clblast::Layout>(layout),
                    static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                    m, n, k, alpha,
                    a_buffer, a_offset, a_ld,
                    b_buffer, b_offset, b_ld, beta,
                    c_buffer, c_offset, c_ld,
                    queue, event, temp_buffer)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const cl_float2 alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, cl_event* event, cl_mem temp_buffer) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::Gemm(static_cast<clblast::Layout>(layout),
                    static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                    m, n, k, float2{alpha.s[0], alpha.s[1]},
                    a_buffer, a_offset, a_ld,
                    b_buffer, b_offset, b_ld, float2{beta.s[0], beta.s[1]},
                    c_buffer, c_offset, c_ld,
                    queue, event, temp_buffer)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const cl_double2 alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, cl_event* event, cl_mem temp_buffer) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::Gemm(static_cast<clblast::Layout>(layout),
                    static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                    m, n, k, double2{alpha.s[0], alpha.s[1]},
                    a_buffer, a_offset, a_ld,
                    b_buffer, b_offset, b_ld, double2{beta.s[0], beta.s[1]},
                    c_buffer, c_offset, c_ld,
                    queue, event, temp_buffer)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k, const cl_half alpha,
                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta,
                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, cl_event* event, cl_mem temp_buffer) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::Gemm(static_cast<clblast::Layout>(layout),
                    static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                    m, n, k, alpha,
                    a_buffer, a_offset, a_ld,
                    b_buffer, b_offset, b_ld, beta,
                    c_buffer, c_offset, c_ld,
                    queue, event, temp_buffer)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// =================================================================================================

// GEMM get temporary buffer size
CLBlastStatusCode CLBlastSGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k,
                                             const size_t a_offset, const size_t a_ld,
                                             const size_t b_offset, const size_t b_ld,
                                             const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, size_t* temp_buffer_size) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmTempBufferSize<float>(static_cast<clblast::Layout>(layout),
                                         static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                         m, n, k, a_offset, a_ld, b_offset, b_ld, c_offset, c_ld,
                                         queue, *temp_buffer_size)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k,
                                             const size_t a_offset, const size_t a_ld,
                                             const size_t b_offset, const size_t b_ld,
                                             const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, size_t* temp_buffer_size) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmTempBufferSize<double>(static_cast<clblast::Layout>(layout),
                                          static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                          m, n, k, a_offset, a_ld, b_offset, b_ld, c_offset, c_ld,
                                          queue, *temp_buffer_size)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k,
                                             const size_t a_offset, const size_t a_ld,
                                             const size_t b_offset, const size_t b_ld,
                                             const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, size_t* temp_buffer_size) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmTempBufferSize<float2>(static_cast<clblast::Layout>(layout),
                                          static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                          m, n, k, a_offset, a_ld, b_offset, b_ld, c_offset, c_ld,
                                          queue, *temp_buffer_size)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k,
                                             const size_t a_offset, const size_t a_ld,
                                             const size_t b_offset, const size_t b_ld,
                                             const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, size_t* temp_buffer_size) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmTempBufferSize<double2>(static_cast<clblast::Layout>(layout),
                                           static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                           m, n, k, a_offset, a_ld, b_offset, b_ld, c_offset, c_ld,
                                           queue, *temp_buffer_size)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
                                             const size_t m, const size_t n, const size_t k,
                                             const size_t a_offset, const size_t a_ld,
                                             const size_t b_offset, const size_t b_ld,
                                             const size_t c_offset, const size_t c_ld,
                                             cl_command_queue* queue, size_t* temp_buffer_size) {
  try {
    return static_cast<CLBlastStatusCode>(
      clblast::GemmTempBufferSize<half>(static_cast<clblast::Layout>(layout),
                                        static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose),
                                        m, n, k, a_offset, a_ld, b_offset, b_ld, c_offset, c_ld,
                                        queue, *temp_buffer_size)
    );
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// =================================================================================================
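
// Illustrative workflow sketch (not part of the library itself): query the required size first,
// then allocate the scratch buffer once and reuse it across repeated GEMM calls of the same shape.
// The OpenCL `context` and `queue` variables and error handling are assumed:
//
//   size_t temp_size = 0;
//   CLBlastSGemmTempBufferSize(CLBlastLayoutColMajor, CLBlastTransposeNo, CLBlastTransposeNo,
//                              m, n, k, 0, m, 0, k, 0, m, &queue, &temp_size);
//   cl_mem temp = NULL;
//   if (temp_size > 0) { temp = clCreateBuffer(context, CL_MEM_READ_WRITE, temp_size, NULL, NULL); }
//   CLBlastSgemmWithTempBuffer(CLBlastLayoutColMajor, CLBlastTransposeNo, CLBlastTransposeNo,
//                              m, n, k, 1.0f, a_buffer, 0, m, b_buffer, 0, k, 0.0f,
//                              c_buffer, 0, m, &queue, NULL, temp);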

// Clears the cache of stored binaries
CLBlastStatusCode CLBlastClearCache() {
  try {
    return static_cast<CLBlastStatusCode>(clblast::ClearCache());
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// Fills the cache with binaries for a specific device
CLBlastStatusCode CLBlastFillCache(const cl_device_id device) {
  try {
    return static_cast<CLBlastStatusCode>(clblast::FillCache(device));
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// =================================================================================================

// Overrides the tuning parameters for this device-precision-kernel combination
CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
                                                       const CLBlastPrecision precision, const size_t num_parameters,
                                                       const char** parameters_names, const size_t* parameters_values) {
  try {
    const auto kernel_name_cpp = std::string(kernel_name);
    const auto precision_cpp = static_cast<clblast::Precision>(precision);
    auto parameters = std::unordered_map<std::string, size_t>();
    for (auto i = size_t{0}; i < num_parameters; ++i) {
      const auto parameter_name = std::string(parameters_names[i]);
      const auto parameter_value = parameters_values[i];
      parameters[parameter_name] = parameter_value;
    }
    const auto status = clblast::OverrideParameters(device, kernel_name_cpp, precision_cpp, parameters);
    return static_cast<CLBlastStatusCode>(status);
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}

// =================================================================================================
CLBlast-1.6.3/src/clblast_cuda.cpp000066400000000000000000005117301463263031500167320ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements all the BLAS API calls (CUDA version). In all cases, it does little more
// than create a new object of the appropriate type and call the main routine on that object.
// It forwards all status codes to the caller.
//
// =================================================================================================

#include <string>

#include "routines/routines.hpp"

#include "clblast_cuda.h"

namespace clblast {
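
// Illustrative usage sketch (not part of this file): unlike the OpenCL API, the CUDA entry points
// below take a CUcontext and CUdevice instead of a queue/event pair, and device memory is passed
// as CUdeviceptr. A minimal caller, assuming the CUDA driver API has already been initialised
// with cuInit(0):
//
//   CUdevice device; CUcontext context;
//   cuDeviceGet(&device, 0);
//   cuCtxCreate(&context, 0, device);
//   CUdeviceptr x; cuMemAlloc(&x, n * sizeof(float));
//   const auto status = clblast::Scal(n, 2.0f, x, 0, 1, context, device);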

// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// Generate Givens plane rotation: SROTG/DROTG
template <typename T>
StatusCode Rotg(CUdeviceptr, const size_t,
                CUdeviceptr, const size_t,
                CUdeviceptr, const size_t,
                CUdeviceptr, const size_t,
                const CUcontext, const CUdevice) {
  return StatusCode::kNotImplemented;
}
template StatusCode PUBLIC_API Rotg<float>(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Rotg<double>(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// Generate modified Givens plane rotation: SROTMG/DROTMG
template <typename T>
StatusCode Rotmg(CUdeviceptr, const size_t,
                 CUdeviceptr, const size_t,
                 CUdeviceptr, const size_t,
                 const CUdeviceptr, const size_t,
                 CUdeviceptr, const size_t,
                 const CUcontext, const CUdevice) {
  return StatusCode::kNotImplemented;
}
template StatusCode PUBLIC_API Rotmg<float>(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Rotmg<double>(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// Apply Givens plane rotation: SROT/DROT
template <typename T>
StatusCode Rot(const size_t,
               CUdeviceptr, const size_t, const size_t,
               CUdeviceptr, const size_t, const size_t,
               const T, const T,
               const CUcontext, const CUdevice) {
  return StatusCode::kNotImplemented;
}
template StatusCode PUBLIC_API Rot(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const float, const float, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Rot(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const double, const double, const CUcontext, const CUdevice);

// Apply modified Givens plane rotation: SROTM/DROTM
template <typename T>
StatusCode Rotm(const size_t,
                CUdeviceptr, const size_t, const size_t,
                CUdeviceptr, const size_t, const size_t,
                CUdeviceptr, const size_t,
                const CUcontext, const CUdevice) {
  return StatusCode::kNotImplemented;
}
template StatusCode PUBLIC_API Rotm<float>(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Rotm<double>(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
template <typename T>
StatusCode Swap(const size_t n,
                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xswap<T>(queue_cpp, nullptr);
    routine.DoSwap(n,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Swap<float>(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Swap<double>(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Swap<float2>(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Swap<double2>(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Swap<half>(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
template <typename T>
StatusCode Scal(const size_t n, const T alpha,
                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xscal<T>(queue_cpp, nullptr);
    routine.DoScal(n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Scal(const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Scal(const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Scal(const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Scal(const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Scal(const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
template <typename T>
StatusCode Copy(const size_t n,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xcopy<T>(queue_cpp, nullptr);
    routine.DoCopy(n,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Copy<float>(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Copy<double>(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Copy<float2>(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Copy<double2>(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Copy<half>(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
template <typename T>
StatusCode Axpy(const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xaxpy<T>(queue_cpp, nullptr);
    routine.DoAxpy(n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Axpy(const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Axpy(const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Axpy(const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Axpy(const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Axpy(const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Dot product of two vectors: SDOT/DDOT/HDOT
template <typename T>
StatusCode Dot(const size_t n,
               CUdeviceptr dot_buffer, const size_t dot_offset,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xdot<T>(queue_cpp, nullptr);
    routine.DoDot(n,
                  Buffer<T>(dot_buffer), dot_offset,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dot<float>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Dot<double>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Dot<half>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Dot product of two complex vectors: CDOTU/ZDOTU
template <typename T>
StatusCode Dotu(const size_t n,
                CUdeviceptr dot_buffer, const size_t dot_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xdotu<T>(queue_cpp, nullptr);
    routine.DoDotu(n,
                   Buffer<T>(dot_buffer), dot_offset,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dotu<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Dotu<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
template <typename T>
StatusCode Dotc(const size_t n,
                CUdeviceptr dot_buffer, const size_t dot_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xdotc<T>(queue_cpp, nullptr);
    routine.DoDotc(n,
                   Buffer<T>(dot_buffer), dot_offset,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dotc<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Dotc<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
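
// Illustrative note (not part of this file): the reduction routines in this file do not return
// their result directly; they write it into a one-element device buffer at the given offset. A
// sketch, with `context`, `device` and initialised vectors `x` and `y` assumed:
//
//   CUdeviceptr dot; cuMemAlloc(&dot, sizeof(float));
//   auto status = clblast::Dot<float>(n, dot, 0, x, 0, 1, y, 0, 1, context, device);
//   float result = 0.0f;
//   cuMemcpyDtoH(&result, dot, sizeof(float));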

// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
template <typename T>
StatusCode Nrm2(const size_t n,
                CUdeviceptr nrm2_buffer, const size_t nrm2_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xnrm2<T>(queue_cpp, nullptr);
    routine.DoNrm2(n,
                   Buffer<T>(nrm2_buffer), nrm2_offset,
                   Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Nrm2<float>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Nrm2<double>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Nrm2<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Nrm2<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Nrm2<half>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
template <typename T>
StatusCode Asum(const size_t n,
                CUdeviceptr asum_buffer, const size_t asum_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xasum<T>(queue_cpp, nullptr);
    routine.DoAsum(n,
                   Buffer<T>(asum_buffer), asum_offset,
                   Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Asum<float>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Asum<double>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Asum<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Asum<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Asum<half>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
template <typename T>
StatusCode Sum(const size_t n,
               CUdeviceptr sum_buffer, const size_t sum_offset,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xsum<T>(queue_cpp, nullptr);
    routine.DoSum(n,
                  Buffer<T>(sum_buffer), sum_offset,
                  Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Sum<float>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Sum<double>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Sum<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Sum<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Sum<half>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
StatusCode Amax(const size_t n,
                CUdeviceptr imax_buffer, const size_t imax_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xamax<T>(queue_cpp, nullptr);
    routine.DoAmax(n,
                   Buffer<unsigned int>(imax_buffer), imax_offset,
                   Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Amax<float>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Amax<double>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Amax<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Amax<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Amax<half>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
template <typename T>
StatusCode Amin(const size_t n,
                CUdeviceptr imin_buffer, const size_t imin_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xamin<T>(queue_cpp, nullptr);
    routine.DoAmin(n,
                   Buffer<unsigned int>(imin_buffer), imin_offset,
                   Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Amin<float>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Amin<double>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Amin<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Amin<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Amin<half>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
template <typename T>
StatusCode Max(const size_t n,
               CUdeviceptr imax_buffer, const size_t imax_offset,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xmax<T>(queue_cpp, nullptr);
    routine.DoMax(n,
                  Buffer<unsigned int>(imax_buffer), imax_offset,
                  Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Max<float>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Max<double>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Max<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Max<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Max<half>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
template <typename T>
StatusCode Min(const size_t n,
               CUdeviceptr imin_buffer, const size_t imin_offset,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xmin<T>(queue_cpp, nullptr);
    routine.DoMin(n,
                  Buffer<unsigned int>(imin_buffer), imin_offset,
                  Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Min<float>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Min<double>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Min<float2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Min<double2>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Min<half>(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
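
// Illustrative usage sketch (not part of this file) for the level-2 routines that follow: GEMV
// computes y = alpha*A*x + beta*y. With column-major layout the leading dimension a_ld must be at
// least m. The buffers below are assumed to be allocated and filled already:
//
//   auto status = clblast::Gemv(Layout::kColMajor, Transpose::kNo, m, n, 1.0f,
//                               a, 0, m, x, 0, 1, 0.0f, y, 0, 1, context, device);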

// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                const size_t m, const size_t n, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xgemv<T>(queue_cpp, nullptr);
    routine.DoGemv(layout, a_transpose, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(x_buffer), x_offset, x_inc, beta,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
template <typename T>
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
                const size_t m, const size_t n, const size_t kl, const size_t ku, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xgbmv<T>(queue_cpp, nullptr);
    routine.DoGbmv(layout, a_transpose, m, n, kl, ku, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(x_buffer), x_offset, x_inc, beta,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
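
// Illustrative note (not part of this file): the banded routines store only the band of the
// matrix. In the conventional BLAS packing used by GBMV, a matrix with kl sub-diagonals and ku
// super-diagonals is stored column by column, with element A(i,j) at packed row (ku + i - j) of
// column j (0-based), so a_ld must be at least kl + ku + 1.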

// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
template <typename T>
StatusCode Hemv(const Layout layout, const Triangle triangle,
                const size_t n, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xhemv<T>(queue_cpp, nullptr);
    routine.DoHemv(layout, triangle, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(x_buffer), x_offset, x_inc, beta,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
template <typename T>
StatusCode Hbmv(const Layout layout, const Triangle triangle,
                const size_t n, const size_t k, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xhbmv<T>(queue_cpp, nullptr);
    routine.DoHbmv(layout, triangle, n, k, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(x_buffer), x_offset, x_inc, beta,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
template <typename T>
StatusCode Hpmv(const Layout layout, const Triangle triangle,
                const size_t n, const T alpha,
                const CUdeviceptr ap_buffer, const size_t ap_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xhpmv<T>(queue_cpp, nullptr);
    routine.DoHpmv(layout, triangle, n, alpha,
                   Buffer<T>(ap_buffer), ap_offset,
                   Buffer<T>(x_buffer), x_offset, x_inc, beta,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const size_t, const float2, const CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const size_t, const double2, const CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
template <typename T>
StatusCode Symv(const Layout layout, const Triangle triangle,
                const size_t n, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xsymv<T>(queue_cpp, nullptr);
    routine.DoSymv(layout, triangle, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(x_buffer), x_offset, x_inc, beta,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
template <typename T>
StatusCode Sbmv(const Layout layout, const Triangle triangle,
                const size_t n, const size_t k, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xsbmv<T>(queue_cpp, nullptr);
    routine.DoSbmv(layout, triangle, n, k, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(x_buffer), x_offset, x_inc, beta,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
template <typename T>
StatusCode Spmv(const Layout layout, const Triangle triangle,
                const size_t n, const T alpha,
                const CUdeviceptr ap_buffer, const size_t ap_offset,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xspmv<T>(queue_cpp, nullptr);
    routine.DoSpmv(layout, triangle, n, alpha,
                   Buffer<T>(ap_buffer), ap_offset,
                   Buffer<T>(x_buffer), x_offset, x_inc, beta,
                   Buffer<T>(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const float, const CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
template <typename T>
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                const size_t n,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xtrmv<T>(queue_cpp, nullptr);
    routine.DoTrmv(layout, triangle, a_transpose, diagonal, n,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trmv<half>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
{ return DispatchException(); } } template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUcontext context, const CUdevice device) { try { const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtpmv(queue_cpp, nullptr); routine.DoTpmv(layout, triangle, a_transpose, diagonal, n, Buffer(ap_buffer), ap_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUcontext context, const CUdevice device) { try { const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrsv(queue_cpp, nullptr); routine.DoTrsv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice); // General rank-1 matrix 
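// Note: as the definitions above show, Tbsv and Tpsv are not implemented in this CUDA API and
// always return StatusCode::kNotImplemented. A minimal caller-side status check (illustrative
// sketch only, not part of this file; `ab`, `x`, `context` and `device` are assumed to exist):
//
//   const auto status = clblast::Tbsv<float>(clblast::Layout::kRowMajor, clblast::Triangle::kUpper,
//                                            clblast::Transpose::kNo, clblast::Diagonal::kNonUnit,
//                                            n, k, ab, 0, ab_ld, x, 0, 1, context, device);
//   if (status == clblast::StatusCode::kNotImplemented) { /* fall back to another routine */ }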
// General rank-1 matrix update: SGER/DGER/HGER
template <typename T>
StatusCode Ger(const Layout layout, const size_t m, const size_t n, const T alpha,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
               CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xger<T>(queue_cpp, nullptr);
    routine.DoGer(layout, m, n, alpha,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(y_buffer), y_offset, y_inc,
                  Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Ger<float>(const Layout, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Ger<double>(const Layout, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Ger<half>(const Layout, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
StatusCode Geru(const Layout layout, const size_t m, const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xgeru<T>(queue_cpp, nullptr);
    routine.DoGeru(layout, m, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Geru<float2>(const Layout, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Geru<double2>(const Layout, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// General rank-1 complex conjugated matrix update: CGERC/ZGERC
template <typename T>
StatusCode Gerc(const Layout layout, const size_t m, const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xgerc<T>(queue_cpp, nullptr);
    routine.DoGerc(layout, m, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gerc<float2>(const Layout, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Gerc<double2>(const Layout, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Hermitian rank-1 matrix update: CHER/ZHER
template <typename T>
StatusCode Her(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xher<std::complex<T>,T>(queue_cpp, nullptr);
    routine.DoHer(layout, triangle, n, alpha,
                  Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
                  Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her<float>(const Layout, const Triangle, const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Her<double>(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Hermitian packed rank-1 matrix update: CHPR/ZHPR
template <typename T>
StatusCode Hpr(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               CUdeviceptr ap_buffer, const size_t ap_offset,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xhpr<std::complex<T>,T>(queue_cpp, nullptr);
    routine.DoHpr(layout, triangle, n, alpha,
                  Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
                  Buffer<std::complex<T>>(ap_buffer), ap_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpr<float>(const Layout, const Triangle, const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Hpr<double>(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// Hermitian rank-2 matrix update: CHER2/ZHER2
template <typename T>
StatusCode Her2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xher2<T>(queue_cpp, nullptr);
    routine.DoHer2(layout, triangle, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her2<float2>(const Layout, const Triangle, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Her2<double2>(const Layout, const Triangle, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
template <typename T>
StatusCode Hpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUdeviceptr ap_buffer, const size_t ap_offset,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xhpr2<T>(queue_cpp, nullptr);
    routine.DoHpr2(layout, triangle, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(ap_buffer), ap_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpr2<float2>(const Layout, const Triangle, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xsyr<T>(queue_cpp, nullptr);
    routine.DoSyr(layout, triangle, n, alpha,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr<float>(const Layout, const Triangle, const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               CUdeviceptr ap_buffer, const size_t ap_offset,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xspr<T>(queue_cpp, nullptr);
    routine.DoSpr(layout, triangle, n, alpha,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(ap_buffer), ap_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Spr<float>(const Layout, const Triangle, const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xsyr2<T>(queue_cpp, nullptr);
    routine.DoSyr2(layout, triangle, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(a_buffer), a_offset, a_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr2<float>(const Layout, const Triangle, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
                CUdeviceptr ap_buffer, const size_t ap_offset,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xspr2<T>(queue_cpp, nullptr);
    routine.DoSpr2(layout, triangle, n, alpha,
                   Buffer<T>(x_buffer), x_offset, x_inc,
                   Buffer<T>(y_buffer), y_offset, y_inc,
                   Buffer<T>(ap_buffer), ap_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Spr2<float>(const Layout, const Triangle, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================

// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                const size_t m, const size_t n, const size_t k, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device, CUdeviceptr temp_buffer) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xgemm<T>(queue_cpp, nullptr);
    const auto temp_buffer_provided = temp_buffer != 0;
    auto temp_buffer_cpp = temp_buffer_provided ? Buffer<T>(temp_buffer) : Buffer<T>(0);
    routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld, beta,
                   Buffer<T>(c_buffer), c_offset, c_ld,
                   temp_buffer_cpp, temp_buffer_provided);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gemm<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice, CUdeviceptr);
template StatusCode PUBLIC_API Gemm<double>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice, CUdeviceptr);
template StatusCode PUBLIC_API Gemm<float2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice, CUdeviceptr);
template StatusCode PUBLIC_API Gemm<double2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice, CUdeviceptr);
template StatusCode PUBLIC_API Gemm<half>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice, CUdeviceptr);
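// Usage sketch (illustrative only, not part of this file): querying the required temporary-buffer
// size with GemmTempBufferSize (defined near the bottom of this file) and handing a pre-allocated
// buffer to Gemm, so the library does not allocate one internally. The matrix arguments and the
// CUDA `context`/`device` handles are assumed to exist on the caller's side:
//
//   size_t temp_size = 0;
//   if (clblast::GemmTempBufferSize<float>(layout, a_tr, b_tr, m, n, k, 0, a_ld, 0, b_ld, 0, c_ld,
//                                          device, temp_size) == clblast::StatusCode::kSuccess &&
//       temp_size > 0) {
//     CUdeviceptr temp; cuMemAlloc(&temp, temp_size);  // temp_size is already in bytes
//     clblast::Gemm<float>(layout, a_tr, b_tr, m, n, k, 1.0f, a, 0, a_ld, b, 0, b_ld,
//                          0.0f, c, 0, c_ld, context, device, temp);
//     cuMemFree(temp);
//   }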
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xsymm<T>(queue_cpp, nullptr);
    routine.DoSymm(layout, side, triangle, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld, beta,
                   Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Symm<float>(const Layout, const Side, const Triangle, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Symm<double>(const Layout, const Side, const Triangle, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Symm<float2>(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Symm<double2>(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Symm<half>(const Layout, const Side, const Triangle, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xhemm<T>(queue_cpp, nullptr);
    routine.DoHemm(layout, side, triangle, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld, beta,
                   Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hemm<float2>(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xsyrk<T>(queue_cpp, nullptr);
    routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld, beta,
                   Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syrk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syrk<double>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syrk<float2>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xherk<std::complex<T>,T>(queue_cpp, nullptr);
    routine.DoHerk(layout, triangle, a_transpose, n, k, alpha,
                   Buffer<std::complex<T>>(a_buffer), a_offset, a_ld, beta,
                   Buffer<std::complex<T>>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Herk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k, const T alpha,
                 const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                 const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
                 CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                 const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xsyr2k<T>(queue_cpp, nullptr);
    routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha,
                    Buffer<T>(a_buffer), a_offset, a_ld,
                    Buffer<T>(b_buffer), b_offset, b_ld, beta,
                    Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr2k<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syr2k<double>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syr2k<float2>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k, const T alpha,
                 const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                 const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const U beta,
                 CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
                 const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xher2k<T,U>(queue_cpp, nullptr);
    routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha,
                    Buffer<T>(a_buffer), a_offset, a_ld,
                    Buffer<T>(b_buffer), b_offset, b_ld, beta,
                    Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her2k<float2,float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
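// Note: Her2k takes a complex alpha (T) but a real-valued beta (U), matching the BLAS convention
// for HER2K, which keeps the diagonal of the Hermitian result real. A minimal caller-side sketch
// (illustrative only; buffers and CUDA handles are assumed to exist):
//
//   clblast::Her2k<clblast::float2, float>(layout, triangle, ab_tr, n, k,
//                                          clblast::float2{1.0f, 0.0f}, a, 0, a_ld, b, 0, b_ld,
//                                          0.0f, c, 0, c_ld, context, device);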
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xtrmm<T>(queue_cpp, nullptr);
    routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trmm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trmm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trmm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n, const T alpha,
                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xtrsm<T>(queue_cpp, nullptr);
    routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trsm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trsm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================

// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
template <typename T>
StatusCode Had(const size_t n, const T alpha,
               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
               const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const T beta,
               CUdeviceptr z_buffer, const size_t z_offset, const size_t z_inc,
               const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xhad<T>(queue_cpp, nullptr);
    routine.DoHad(n, alpha,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(y_buffer), y_offset, y_inc, beta,
                  Buffer<T>(z_buffer), z_offset, z_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Had<float>(const size_t, const float, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Had<double>(const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Had<float2>(const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Had<double2>(const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Had<half>(const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
                    const size_t m, const size_t n, const T alpha,
                    const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
                    CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
                    const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xomatcopy<T>(queue_cpp, nullptr);
    routine.DoOmatcopy(layout, a_transpose, m, n, alpha,
                       Buffer<T>(a_buffer), a_offset, a_ld,
                       Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const CUcontext, const CUdevice);

// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
template <typename T>
StatusCode Im2col(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                  const CUdeviceptr im_buffer, const size_t im_offset,
                  CUdeviceptr col_buffer, const size_t col_offset,
                  const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xim2col<T>(queue_cpp, nullptr);
    routine.DoIm2col(kernel_mode, channels, height, width, kernel_h, kernel_w,
                     pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                     Buffer<T>(im_buffer), im_offset,
                     Buffer<T>(col_buffer), col_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Im2col<float>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Im2col<double>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Im2col<float2>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Im2col<double2>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Im2col<half>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
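// Sizing note (not taken from this file, but following the standard im2col convention that the
// pad/stride/dilation parameters above suggest): per spatial dimension the number of output
// positions is
//   output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1
// and likewise for output_w, so the col buffer is expected to hold roughly
// channels * kernel_h * kernel_w * output_h * output_w elements. Illustrative only; consult the
// Xim2col implementation for the authoritative layout. The same sizing applies to Col2im below.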
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
template <typename T>
StatusCode Col2im(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                  const CUdeviceptr col_buffer, const size_t col_offset,
                  CUdeviceptr im_buffer, const size_t im_offset,
                  const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xcol2im<T>(queue_cpp, nullptr);
    routine.DoCol2im(kernel_mode, channels, height, width, kernel_h, kernel_w,
                     pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                     Buffer<T>(col_buffer), col_offset,
                     Buffer<T>(im_buffer), im_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Col2im<float>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Col2im<double>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Col2im<float2>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Col2im<double2>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Col2im<half>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
template <typename T>
StatusCode Convgemm(const KernelMode kernel_mode,
                    const size_t channels, const size_t height, const size_t width,
                    const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w,
                    const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                    const size_t num_kernels, const size_t batch_count,
                    const CUdeviceptr im_buffer, const size_t im_offset,
                    const CUdeviceptr kernel_buffer, const size_t kernel_offset,
                    CUdeviceptr result_buffer, const size_t result_offset,
                    const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = Xconvgemm<T>(queue_cpp, nullptr);
    routine.DoConvgemm(kernel_mode, channels, height, width, kernel_h, kernel_w,
                       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                       num_kernels, batch_count,
                       Buffer<T>(im_buffer), im_offset,
                       Buffer<T>(kernel_buffer), kernel_offset,
                       Buffer<T>(result_buffer), result_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Convgemm<float>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Convgemm<double>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Convgemm<half>(const KernelMode, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUcontext, const CUdevice);

// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n, const T *alphas,
                       const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc,
                       CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc,
                       const size_t batch_count,
                       const CUcontext context, const CUdevice device) {
  try {
    const auto context_cpp = Context(context);
    const auto device_cpp = Device(device);
    auto queue_cpp = Queue(context_cpp, device_cpp);
    auto routine = XaxpyBatched<T>(queue_cpp, nullptr);
    auto alphas_cpp = std::vector<T>();
    auto x_offsets_cpp = std::vector<size_t>();
    auto y_offsets_cpp = std::vector<size_t>();
    for (auto batch = size_t{0}; batch < batch_count; ++batch) {
      alphas_cpp.push_back(alphas[batch]);
      x_offsets_cpp.push_back(x_offsets[batch]);
      y_offsets_cpp.push_back(y_offsets[batch]);
    }
    routine.DoAxpyBatched(n, alphas_cpp,
                          Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
                          Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
                          batch_count);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t, const float*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t, const double*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t, const float2*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t, const double2*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t, const half*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice);
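// Usage sketch (illustrative only, not part of this file): batching two AXPY operations over
// disjoint regions of the same device buffers by passing per-batch offsets. The device buffers
// `x` and `y` and the CUDA handles are assumed to exist and to hold at least 2*n elements each:
//
//   const float alphas[] = {1.0f, 2.0f};
//   const size_t x_offsets[] = {0, n};  // batch 1 starts right after batch 0
//   const size_t y_offsets[] = {0, n};
//   clblast::AxpyBatched<float>(n, alphas, x, x_offsets, 1, y, y_offsets, 1, 2, context, device);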
{ return DispatchException(); } } template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float*, const CUdeviceptr, const size_t*, const size_t, const CUdeviceptr, const size_t*, const size_t, const float*, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double*, const CUdeviceptr, const size_t*, const size_t, const CUdeviceptr, const size_t*, const size_t, const double*, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2*, const CUdeviceptr, const size_t*, const size_t, const CUdeviceptr, const size_t*, const size_t, const float2*, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2*, const CUdeviceptr, const size_t*, const size_t, const CUdeviceptr, const size_t*, const size_t, const double2*, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half*, const CUdeviceptr, const size_t*, const size_t, const CUdeviceptr, const size_t*, const size_t, const half*, CUdeviceptr, const size_t*, const size_t, const size_t, const CUcontext, const CUdevice); // StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED template StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count, const CUcontext context, const CUdevice device) { try { const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = XgemmStridedBatched(queue_cpp, nullptr); routine.DoGemmStridedBatched(layout, a_transpose, b_transpose, m, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, a_stride, Buffer(b_buffer), b_offset, b_ld, b_stride, beta, Buffer(c_buffer), c_offset, c_ld, c_stride, batch_count); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } }
template StatusCode PUBLIC_API GemmStridedBatched<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float, const CUdeviceptr, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API GemmStridedBatched<double>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API GemmStridedBatched<float2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API GemmStridedBatched<double2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, const size_t, const size_t, const CUcontext, const CUdevice);
template StatusCode PUBLIC_API GemmStridedBatched<half>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, const size_t, const size_t, const CUcontext, const CUdevice);

// =================================================================================================

// Retrieves the required size of the temporary buffer for the GEMM kernel (optional)
template <typename T>
StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                              const size_t m, const size_t n, const size_t k,
                              const size_t a_offset, const size_t a_ld,
                              const size_t b_offset, const size_t b_ld,
                              const size_t c_offset, const size_t c_ld,
                              const CUdevice device, size_t& temp_buffer_size) {
  try {

    // Retrieves the tuning database
    const auto device_cpp = Device(device);
    const auto kernel_names = std::vector<std::string>{"Xgemm", "GemmRoutine"};
    Databases db(kernel_names);
    Routine::InitDatabase(device_cpp, kernel_names, PrecisionValue<T>(), {}, db);

    // Computes the buffer size
    if (Xgemm<T>::UseDirectKernel(m, n, k, db["XGEMM_MIN_INDIRECT_SIZE"])) {
      temp_buffer_size = 0;
    }
    else {
      temp_buffer_size = Xgemm<T>::GetTempSize(layout, a_transpose, b_transpose, m, n, k,
                                               a_offset, a_ld, b_offset, b_ld, c_offset, c_ld,
                                               db["MWG"], db["NWG"], db["KWG"] * db["KREG"],
                                               db["GEMMK"]);
    }
    temp_buffer_size *= sizeof(T); // translate from num-elements to bytes
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); } }
template StatusCode PUBLIC_API GemmTempBufferSize<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdevice, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<double>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdevice, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<float2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdevice, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<double2>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdevice, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<half>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdevice, size_t&);

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/clblast_netlib_c.cpp000066400000000000000000007552121463263031500176020ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Netlib CBLAS implementations of the CLBlast BLAS routines, performing
// buffer copies automatically and running on the default OpenCL platform and device. For full
// control over performance, it is advised to use the regular clblast.h or clblast_c.h headers
// instead.
//
// =================================================================================================

#include <cstdlib>

#include "clblast_netlib_c.h"
#include "clblast.h"
#include "utilities/utilities.hpp"

// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;

// Option to make OpenCL device and context static to avoid re-creation upon multiple calls to the
// Netlib API. Disadvantage is that they are not cleaned-up until program termination.
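// For illustration only -- a minimal sketch (not part of the library) of how a client selects the
// OpenCL platform and device when linking against this Netlib layer. Both environment variables
// are read by get_device() below and default to 0 when unset; the binary name is hypothetical:
//
//   $ export CLBLAST_PLATFORM=0   # index of the OpenCL platform to use
//   $ export CLBLAST_DEVICE=1     # index of the device within that platform
//   $ ./example_netlib_client     # plain CBLAS calls now run on that device
//
// Compiling with -DNETLIB_PERSISTENT_OPENCL additionally enables the static caching of the device
// and context that the comment above describes.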
#ifdef NETLIB_PERSISTENT_OPENCL
  #define OPTIONAL_STATIC static
#else
  #define OPTIONAL_STATIC
#endif

// Helper function to get a default OpenCL platform and device
clblast::Device get_device() {
  auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0});
  auto device_id = clblast::ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0});
  auto platform = clblast::Platform(platform_id);
  return clblast::Device(platform, device_id);
}

// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// ROTG
void cblas_srotg(float* sa, float* sb, float* sc, float* ss) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto sa_size = 1;
  const auto sb_size = 1;
  const auto sc_size = 1;
  const auto ss_size = 1;
  auto sa_buffer = clblast::Buffer<float>(context, sa_size);
  auto sb_buffer = clblast::Buffer<float>(context, sb_size);
  auto sc_buffer = clblast::Buffer<float>(context, sc_size);
  auto ss_buffer = clblast::Buffer<float>(context, ss_size);
  sa_buffer.Write(queue, sa_size, reinterpret_cast<float*>(sa));
  sb_buffer.Write(queue, sb_size, reinterpret_cast<float*>(sb));
  sc_buffer.Write(queue, sc_size, reinterpret_cast<float*>(sc));
  ss_buffer.Write(queue, ss_size, reinterpret_cast<float*>(ss));
  auto queue_cl = queue();
  auto s = clblast::Rotg<float>(sa_buffer(), 0, sb_buffer(), 0, sc_buffer(), 0, ss_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  sa_buffer.Read(queue, sa_size, reinterpret_cast<float*>(sa));
  sb_buffer.Read(queue, sb_size, reinterpret_cast<float*>(sb));
  sc_buffer.Read(queue, sc_size, reinterpret_cast<float*>(sc));
  ss_buffer.Read(queue, ss_size, reinterpret_cast<float*>(ss));
}
void cblas_drotg(double* sa, double* sb, double* sc, double* ss) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto sa_size = 1;
  const auto sb_size = 1;
  const auto sc_size = 1;
  const auto ss_size = 1;
  auto sa_buffer = clblast::Buffer<double>(context, sa_size);
  auto sb_buffer = clblast::Buffer<double>(context, sb_size);
  auto sc_buffer = clblast::Buffer<double>(context, sc_size);
  auto ss_buffer = clblast::Buffer<double>(context, ss_size);
  sa_buffer.Write(queue, sa_size, reinterpret_cast<double*>(sa));
  sb_buffer.Write(queue, sb_size, reinterpret_cast<double*>(sb));
  sc_buffer.Write(queue, sc_size, reinterpret_cast<double*>(sc));
  ss_buffer.Write(queue, ss_size, reinterpret_cast<double*>(ss));
  auto queue_cl = queue();
  auto s = clblast::Rotg<double>(sa_buffer(), 0, sb_buffer(), 0, sc_buffer(), 0, ss_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  sa_buffer.Read(queue, sa_size, reinterpret_cast<double*>(sa));
  sb_buffer.Read(queue, sb_size, reinterpret_cast<double*>(sb));
  sc_buffer.Read(queue, sc_size, reinterpret_cast<double*>(sc));
  ss_buffer.Read(queue, ss_size, reinterpret_cast<double*>(ss));
}

// ROTMG
void cblas_srotmg(float* sd1, float* sd2, float* sx1, const float sy1, float* sparam) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto sy1_size = 1;
  const auto sd1_size = 1;
  const auto sd2_size = 1;
  const auto sx1_size = 1;
  const auto sparam_size = 1;
  auto sy1_buffer = clblast::Buffer<float>(context,
sy1_size);
  float sy1_vec[1]; sy1_vec[0] = sy1;
  auto sd1_buffer = clblast::Buffer<float>(context, sd1_size);
  auto sd2_buffer = clblast::Buffer<float>(context, sd2_size);
  auto sx1_buffer = clblast::Buffer<float>(context, sx1_size);
  auto sparam_buffer = clblast::Buffer<float>(context, sparam_size);
  sy1_buffer.Write(queue, sy1_size, reinterpret_cast<float*>(sy1_vec));
  sd1_buffer.Write(queue, sd1_size, reinterpret_cast<float*>(sd1));
  sd2_buffer.Write(queue, sd2_size, reinterpret_cast<float*>(sd2));
  sx1_buffer.Write(queue, sx1_size, reinterpret_cast<float*>(sx1));
  sparam_buffer.Write(queue, sparam_size, reinterpret_cast<float*>(sparam));
  auto queue_cl = queue();
  auto s = clblast::Rotmg<float>(sd1_buffer(), 0, sd2_buffer(), 0, sx1_buffer(), 0, sy1_buffer(), 0,
                                 sparam_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  sd1_buffer.Read(queue, sd1_size, reinterpret_cast<float*>(sd1));
  sd2_buffer.Read(queue, sd2_size, reinterpret_cast<float*>(sd2));
  sx1_buffer.Read(queue, sx1_size, reinterpret_cast<float*>(sx1));
  sparam_buffer.Read(queue, sparam_size, reinterpret_cast<float*>(sparam));
}
void cblas_drotmg(double* sd1, double* sd2, double* sx1, const double sy1, double* sparam) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto sy1_size = 1;
  const auto sd1_size = 1;
  const auto sd2_size = 1;
  const auto sx1_size = 1;
  const auto sparam_size = 1;
  auto sy1_buffer = clblast::Buffer<double>(context, sy1_size);
  double sy1_vec[1]; sy1_vec[0] = sy1;
  auto sd1_buffer = clblast::Buffer<double>(context, sd1_size);
  auto sd2_buffer = clblast::Buffer<double>(context, sd2_size);
  auto sx1_buffer = clblast::Buffer<double>(context, sx1_size);
  auto sparam_buffer = clblast::Buffer<double>(context, sparam_size);
  sy1_buffer.Write(queue, sy1_size, reinterpret_cast<double*>(sy1_vec));
  sd1_buffer.Write(queue, sd1_size, reinterpret_cast<double*>(sd1));
  sd2_buffer.Write(queue, sd2_size, reinterpret_cast<double*>(sd2));
  sx1_buffer.Write(queue, sx1_size, reinterpret_cast<double*>(sx1));
  sparam_buffer.Write(queue, sparam_size, reinterpret_cast<double*>(sparam));
  auto queue_cl = queue();
  auto s = clblast::Rotmg<double>(sd1_buffer(), 0, sd2_buffer(), 0, sx1_buffer(), 0, sy1_buffer(), 0,
                                  sparam_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  sd1_buffer.Read(queue, sd1_size, reinterpret_cast<double*>(sd1));
  sd2_buffer.Read(queue, sd2_size, reinterpret_cast<double*>(sd2));
  sx1_buffer.Read(queue, sx1_size, reinterpret_cast<double*>(sx1));
  sparam_buffer.Read(queue, sparam_size, reinterpret_cast<double*>(sparam));
}

// ROT
void cblas_srot(const int n, float* x, const int x_inc, float* y, const int y_inc,
                const float cos, const float sin) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
  auto queue_cl = queue();
  auto s = clblast::Rot(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, cos, sin, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
}
void cblas_drot(const int n, double* x, const int x_inc, double* y, const
int y_inc, const double cos, const double sin) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Rot(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, cos, sin, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); } // ROTM void cblas_srotm(const int n, float* x, const int x_inc, float* y, const int y_inc, float* sparam) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto sparam_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto sparam_buffer = clblast::Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); auto s = clblast::Rotm(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, sparam_buffer(), 0, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); } void cblas_drotm(const int n, double* x, const int x_inc, double* y, const int y_inc, double* sparam) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto sparam_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto sparam_buffer = clblast::Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); auto s = clblast::Rotm(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, sparam_buffer(), 0, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); } // SWAP void cblas_sswap(const int n, float* x, const int x_inc, float* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s 
= clblast::Swap(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_dswap(const int n, double* x, const int x_inc, double* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Swap(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_cswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Swap(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Swap(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); } // SCAL void cblas_sscal(const int n, const float alpha, float* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Scal(n, alpha_cpp, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } void cblas_dscal(const int n, const double alpha, 
double* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Scal(n, alpha_cpp, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } void cblas_cscal(const int n, const void* alpha, void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Scal(n, alpha_cpp, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } void cblas_zscal(const int n, const void* alpha, void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Scal(n, alpha_cpp, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } // COPY void cblas_scopy(const int n, const float* x, const int x_inc, float* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Copy(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_dcopy(const int n, const double* x, const int x_inc, double* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Copy(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_ccopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Copy(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Copy(n, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } // AXPY void cblas_saxpy(const int n, const float alpha, const float* x, const int x_inc, float* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Axpy(n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_daxpy(const int n, const double alpha, const double* x, const int x_inc, double* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Axpy(n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_caxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc) { 
OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Axpy(n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } void cblas_zaxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Axpy(n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } // DOT float cblas_sdot(const int n, const float* x, const int x_inc, const float* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Dot(n, dot_buffer(), 0, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } float dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); return dot[0]; } double cblas_ddot(const int n, const double* x, const int x_inc, const double* y, const int y_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Dot(n, dot_buffer(), 0, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } double dot[dot_size]; dot_buffer.Read(queue, 
dot_size, reinterpret_cast(dot)); return dot[0]; } // DOTU void cblas_cdotu_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Dotu(n, dot_buffer(), 0, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } void cblas_zdotu_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Dotu(n, dot_buffer(), 0, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } // DOTC void cblas_cdotc_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Dotc(n, dot_buffer(), 0, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } void cblas_zdotc_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); auto s = clblast::Dotc(n, dot_buffer(), 0, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, &queue_cl); if (s != 
clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } // NRM2 float cblas_snrm2(const int n, const float* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } float nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); return nrm2[0]; } double cblas_dnrm2(const int n, const double* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } double nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); return nrm2[0]; } float cblas_scnrm2(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } float2 nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); return nrm2[0].real(); } double cblas_dznrm2(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } double2 nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); return nrm2[0].real(); } // ASUM float cblas_sasum(const int n, const float* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = 
clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } float asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); return asum[0]; } double cblas_dasum(const int n, const double* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } double asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); return asum[0]; } float cblas_scasum(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } float2 asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); return asum[0].real(); } double cblas_dzasum(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } double2 asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); return asum[0].real(); } // SUM float cblas_ssum(const int n, const float* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } float sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); return sum[0]; } double 
cblas_dsum(const int n, const double* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } double sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); return sum[0]; } float cblas_scsum(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } float2 sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); return sum[0].real(); } double cblas_dzsum(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } double2 sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); return sum[0].real(); } // AMAX int cblas_isamax(const int n, const float* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imax[imax_size]; imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); return imax[0]; } int cblas_idamax(const int n, const double* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, x_buffer(), 0, x_inc, 
&queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imax[imax_size]; imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); return imax[0]; } int cblas_icamax(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imax[imax_size]; imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); return imax[0]; } int cblas_izamax(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imax[imax_size]; imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); return imax[0]; } // AMIN int cblas_isamin(const int n, const float* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Amin(n, imin_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imin[imin_size]; imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); return imin[0]; } int cblas_idamin(const int n, const double* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Amin(n, imin_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imin[imin_size]; imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); return imin[0]; } int cblas_icamin(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; 
auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Amin(n, imin_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imin[imin_size]; imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); return imin[0]; } int cblas_izamin(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Amin(n, imin_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imin[imin_size]; imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); return imin[0]; } // MAX int cblas_ismax(const int n, const float* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imax[imax_size]; imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); return imax[0]; } int cblas_idmax(const int n, const double* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imax[imax_size]; imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); return imax[0]; } int cblas_icmax(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imax[imax_size]; imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); return imax[0]; } int cblas_izmax(const int n, 
const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imax[imax_size]; imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); return imax[0]; } // MIN int cblas_ismin(const int n, const float* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imin[imin_size]; imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); return imin[0]; } int cblas_idmin(const int n, const double* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imin[imin_size]; imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); return imin[0]; } int cblas_icmin(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } int imin[imin_size]; imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); return imin[0]; } int cblas_izmin(const int n, const void* x, const int x_inc) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl); if (s != 
clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  int imin[imin_size];
  imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
  return imin[0];
}

// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================

// GEMV
void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n,
                 const float alpha,
                 const float* a, const int a_ld,
                 const float* x, const int x_inc,
                 const float beta,
                 float* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, alpha_cpp,
                         a_buffer(), 0, a_ld,
                         x_buffer(), 0, x_inc,
                         beta_cpp,
                         y_buffer(), 0, y_inc,
                         &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
}
void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n,
                 const double alpha,
                 const double* a, const int a_ld,
                 const double* x, const int x_inc,
                 const double beta,
                 double* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ?
// GEMV
void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n, const float alpha, const float* a, const int a_ld,
                 const float* x, const int x_inc, const float beta, float* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
}
void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n, const double alpha, const double* a, const int a_ld,
                 const double* x, const int x_inc, const double beta, double* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
}
void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n, const void* alpha, const void* a, const int a_ld,
                 const void* x, const int x_inc, const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
}
void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n, const void* alpha, const void* a, const int a_ld,
                 const void* x, const int x_inc, const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
}
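// Illustrative usage of the GEMV wrapper above (example values only, not part of the
// library): computes y = 1.0 * A * x + 0.0 * y for a 2x2 row-major A.
//   const float a[] = {1.0f, 2.0f,
//                      3.0f, 4.0f};
//   const float x[] = {1.0f, 1.0f};
//   float y[2] = {0.0f, 0.0f};
//   cblas_sgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo,
//               2, 2, 1.0f, a, 2, x, 1, 0.0f, y, 1);  // y becomes {3, 7}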
// GBMV
void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n, const int kl, const int ku, const float alpha,
                 const float* a, const int a_ld, const float* x, const int x_inc,
                 const float beta, float* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, kl, ku, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
}
void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n, const int kl, const int ku, const double alpha,
                 const double* a, const int a_ld, const double* x, const int x_inc,
                 const double beta, double* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, kl, ku, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
}
void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n, const int kl, const int ku, const void* alpha,
                 const void* a, const int a_ld, const void* x, const int x_inc,
                 const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, kl, ku, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
}
void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                 const int m, const int n, const int kl, const int ku, const void* alpha,
                 const void* a, const int a_ld, const void* x, const int x_inc,
                 const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
  const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Transpose>(a_transpose),
                         m, n, kl, ku, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
}

// HEMV
void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc,
                 const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Hemv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
}
void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc,
                 const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Hemv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
}
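// Note: the banded wrappers (GBMV above, HBMV/SBMV/TBMV/TBSV below) pass kl/ku or k
// straight through to CLBlast; a is expected in the usual BLAS band-storage layout
// (only the stored diagonals), with a_ld the leading dimension of that compacted storage.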
// HBMV
void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const int k, const void* alpha, const void* a, const int a_ld,
                 const void* x, const int x_inc, const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Hbmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, k, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
}
void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const int k, const void* alpha, const void* a, const int a_ld,
                 const void* x, const int x_inc, const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Hbmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, k, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
}

// HPMV
void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const void* alpha, const void* ap, const void* x, const int x_inc,
                 const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Hpmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, alpha_cpp, ap_buffer(), 0, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
}
void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const void* alpha, const void* ap, const void* x, const int x_inc,
                 const void* beta, void* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  auto queue_cl = queue();
  auto s = clblast::Hpmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, alpha_cpp, ap_buffer(), 0, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
}
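// Note: the packed wrappers (HPMV above, and SPMV/TPMV/TPSV/HPR/HPR2 below) store a single
// triangle of the n-by-n matrix, so ap holds n*(n+1)/2 elements; for n = 4 that is
// 4*5/2 = 10 values instead of 16.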
// SYMV
void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const float alpha, const float* a, const int a_ld, const float* x, const int x_inc,
                 const float beta, float* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  auto queue_cl = queue();
  auto s = clblast::Symv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
}
void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const double alpha, const double* a, const int a_ld, const double* x, const int x_inc,
                 const double beta, double* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  auto queue_cl = queue();
  auto s = clblast::Symv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
}

// SBMV
void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const int k, const float alpha, const float* a, const int a_ld,
                 const float* x, const int x_inc, const float beta, float* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  auto queue_cl = queue();
  auto s = clblast::Sbmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, k, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
}
void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const int k, const double alpha, const double* a, const int a_ld,
                 const double* x, const int x_inc, const double beta, double* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  auto queue_cl = queue();
  auto s = clblast::Sbmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, k, alpha_cpp, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
}

// SPMV
void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const float alpha, const float* ap, const float* x, const int x_inc,
                 const float beta, float* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto ap_buffer = clblast::Buffer<float>(context, ap_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  auto queue_cl = queue();
  auto s = clblast::Spmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, alpha_cpp, ap_buffer(), 0, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
}
void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                 const double alpha, const double* ap, const double* x, const int x_inc,
                 const double beta, double* y, const int y_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  auto ap_buffer = clblast::Buffer<double>(context, ap_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  auto queue_cl = queue();
  auto s = clblast::Spmv(static_cast<clblast::Layout>(layout),
                         static_cast<clblast::Triangle>(triangle),
                         n, alpha_cpp, ap_buffer(), 0, x_buffer(), 0, x_inc,
                         beta_cpp, y_buffer(), 0, y_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
}
// TRMV
void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const float* a, const int a_ld, float* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  auto queue_cl = queue();
  auto s = clblast::Trmv<float>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
}
void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const double* a, const int a_ld, double* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  auto queue_cl = queue();
  auto s = clblast::Trmv<double>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
}
void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const void* a, const int a_ld, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Trmv<float2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
}
void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const void* a, const int a_ld, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Trmv<double2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
}

// TBMV
void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tbmv<float>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, k, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
}
void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tbmv<double>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, k, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
}
void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tbmv<float2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, k, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
}
void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tbmv<double2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, k, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
}
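// Note: the triangular wrappers (TRMV/TBMV above, and the TPMV/TRSV/TBSV/TPSV variants
// below) update x in place: x is written to the device, transformed by the routine, and
// read back into the same host array.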
// TPMV
void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const float* ap, float* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  auto ap_buffer = clblast::Buffer<float>(context, ap_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tpmv<float>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, ap_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
}
void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const double* ap, double* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  auto ap_buffer = clblast::Buffer<double>(context, ap_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tpmv<double>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, ap_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
}
void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const void* ap, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tpmv<float2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, ap_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
}
void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const void* ap, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tpmv<double2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, ap_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
}

// TRSV
void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const float* a, const int a_ld, float* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  auto queue_cl = queue();
  auto s = clblast::Trsv<float>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
}
void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const double* a, const int a_ld, double* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  auto queue_cl = queue();
  auto s = clblast::Trsv<double>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
}
void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const void* a, const int a_ld, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Trsv<float2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
}
void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const void* a, const int a_ld, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Trsv<double2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
}
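// Illustrative usage of the TRSV wrapper above (example values only, not part of the
// library): solves L * x = b in place for a 2x2 lower-triangular L.
//   const float l[] = {2.0f, 0.0f,
//                      1.0f, 4.0f};
//   float x[2] = {2.0f, 5.0f};  // holds b on input, x on output
//   cblas_strsv(CLBlastLayoutRowMajor, CLBlastTriangleLower,
//               CLBlastTransposeNo, CLBlastDiagonalNonUnit,
//               2, l, 2, x, 1);  // x becomes {1, 1}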
// TBSV
void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tbsv<float>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, k, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
}
void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tbsv<double>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, k, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
}
void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tbsv<float2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, k, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
}
void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto a_size = n * a_ld;
  const auto x_size = n * x_inc;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tbsv<double2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, k, a_buffer(), 0, a_ld, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
}

// TPSV
void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const float* ap, float* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  auto ap_buffer = clblast::Buffer<float>(context, ap_size);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tpsv<float>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, ap_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
}
void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const double* ap, double* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  auto ap_buffer = clblast::Buffer<double>(context, ap_size);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tpsv<double>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, ap_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
}
void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const void* ap, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tpsv<float2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, ap_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
}
void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle,
                 const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                 const int n, const void* ap, void* x, const int x_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto ap_size = ((n*(n+1)) / 2);
  const auto x_size = n * x_inc;
  auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  auto queue_cl = queue();
  auto s = clblast::Tpsv<double2>(
      static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle),
      static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal),
      n, ap_buffer(), 0, x_buffer(), 0, x_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
}
// GER
void cblas_sger(const CLBlastLayout layout, const int m, const int n, const float alpha,
                const float* x, const int x_inc, const float* y, const int y_inc,
                float* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = m * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  auto queue_cl = queue();
  auto s = clblast::Ger(static_cast<clblast::Layout>(layout), m, n, alpha_cpp,
                        x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                        a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
}
void cblas_dger(const CLBlastLayout layout, const int m, const int n, const double alpha,
                const double* x, const int x_inc, const double* y, const int y_inc,
                double* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = m * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  auto queue_cl = queue();
  auto s = clblast::Ger(static_cast<clblast::Layout>(layout), m, n, alpha_cpp,
                        x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                        a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
}

// GERU
void cblas_cgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha,
                 const void* x, const int x_inc, const void* y, const int y_inc,
                 void* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto x_size = m * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  auto queue_cl = queue();
  auto s = clblast::Geru(static_cast<clblast::Layout>(layout), m, n, alpha_cpp,
                         x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                         a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
}
void cblas_zgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha,
                 const void* x, const int x_inc, const void* y, const int y_inc,
                 void* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto x_size = m * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  auto queue_cl = queue();
  auto s = clblast::Geru(static_cast<clblast::Layout>(layout), m, n, alpha_cpp,
                         x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                         a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
}
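// Illustrative usage of the GER wrapper above (example values only, not part of the
// library): rank-1 update A += 1.0 * x * y^T on a 2x2 row-major A.
//   const float x[] = {1.0f, 2.0f};
//   const float yv[] = {3.0f, 4.0f};
//   float a[4] = {0.0f, 0.0f, 0.0f, 0.0f};
//   cblas_sger(CLBlastLayoutRowMajor, 2, 2, 1.0f, x, 1, yv, 1, a, 2);
//   // a becomes {3, 4, 6, 8}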
// GERC
void cblas_cgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha,
                 const void* x, const int x_inc, const void* y, const int y_inc,
                 void* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto x_size = m * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  auto queue_cl = queue();
  auto s = clblast::Gerc(static_cast<clblast::Layout>(layout), m, n, alpha_cpp,
                         x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                         a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
}
void cblas_zgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha,
                 const void* x, const int x_inc, const void* y, const int y_inc,
                 void* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto x_size = m * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  auto queue_cl = queue();
  auto s = clblast::Gerc(static_cast<clblast::Layout>(layout), m, n, alpha_cpp,
                         x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                         a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
}

// HER
void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                const float alpha, const void* x, const int x_inc, void* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto a_size = n * a_ld;
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  auto queue_cl = queue();
  auto s = clblast::Her(static_cast<clblast::Layout>(layout),
                        static_cast<clblast::Triangle>(triangle),
                        n, alpha_cpp, x_buffer(), 0, x_inc, a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
}
void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                const double alpha, const void* x, const int x_inc, void* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto a_size = n * a_ld;
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  auto queue_cl = queue();
  auto s = clblast::Her(static_cast<clblast::Layout>(layout),
                        static_cast<clblast::Triangle>(triangle),
                        n, alpha_cpp, x_buffer(), 0, x_inc, a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
}

// HPR
void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                const float alpha, const void* x, const int x_inc, void* ap) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto ap_size = ((n*(n+1)) / 2);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
  auto queue_cl = queue();
  auto s = clblast::Hpr(static_cast<clblast::Layout>(layout),
                        static_cast<clblast::Triangle>(triangle),
                        n, alpha_cpp, x_buffer(), 0, x_inc, ap_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  ap_buffer.Read(queue, ap_size, reinterpret_cast<float2*>(ap));
}
void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n,
                const double alpha, const void* x, const int x_inc, void* ap) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto ap_size = ((n*(n+1)) / 2);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
  auto queue_cl = queue();
  auto s = clblast::Hpr(static_cast<clblast::Layout>(layout),
                        static_cast<clblast::Triangle>(triangle),
                        n, alpha_cpp, x_buffer(), 0, x_inc, ap_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  ap_buffer.Read(queue, ap_size, reinterpret_cast<double2*>(ap));
}
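// Note: for the Hermitian rank-1 updates above (HER/HPR) alpha is a real scalar even
// though the matrix and vector elements are complex, matching the standard CBLAS
// signatures; the rank-2 updates (HER2/HPR2 below) take a complex alpha instead.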
cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const void* x, const int x_inc, void* ap) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); auto x_buffer = clblast::Buffer(context, x_size); auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); auto s = clblast::Hpr(static_cast(layout), static_cast(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, ap_buffer(), 0, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } // HER2 void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); auto s = clblast::Her2(static_cast(layout), static_cast(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, a_buffer(), 0, a_ld, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { OPTIONAL_STATIC auto device = get_device(); OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); auto s = clblast::Her2(static_cast(layout), static_cast(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, a_buffer(), 0, a_ld, &queue_cl); if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } // HPR2 void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* ap) { 
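// The packed routines (HPR/HPR2 here, SPR/SPR2 below) store only one triangle of the n-by-n
// matrix, which is why their buffer size is n*(n+1)/2 elements rather than n * a_ld.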
// HER2
void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = n * a_ld;
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  auto queue_cl = queue();
  auto s = clblast::Her2(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
}
void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = n * a_ld;
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  auto queue_cl = queue();
  auto s = clblast::Her2(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
}

// HPR2
void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* ap) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto ap_size = ((n*(n+1)) / 2);
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
  auto queue_cl = queue();
  auto s = clblast::Hpr2(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, ap_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  ap_buffer.Read(queue, ap_size, reinterpret_cast<float2*>(ap));
}
void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* ap) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto ap_size = ((n*(n+1)) / 2);
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
  auto queue_cl = queue();
  auto s = clblast::Hpr2(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, ap_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  ap_buffer.Read(queue, ap_size, reinterpret_cast<double2*>(ap));
}

// SYR
void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, float* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto a_size = n * a_ld;
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  auto queue_cl = queue();
  auto s = clblast::Syr(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
}
void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, double* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto a_size = n * a_ld;
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  auto queue_cl = queue();
  auto s = clblast::Syr(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
}

// SPR
void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, float* ap) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto ap_size = ((n*(n+1)) / 2);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto ap_buffer = clblast::Buffer<float>(context, ap_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
  auto queue_cl = queue();
  auto s = clblast::Spr(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, ap_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  ap_buffer.Read(queue, ap_size, reinterpret_cast<float*>(ap));
}
void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, double* ap) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto ap_size = ((n*(n+1)) / 2);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto ap_buffer = clblast::Buffer<double>(context, ap_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
  auto queue_cl = queue();
  auto s = clblast::Spr(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, ap_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  ap_buffer.Read(queue, ap_size, reinterpret_cast<double*>(ap));
}

// SYR2
void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = n * a_ld;
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  auto queue_cl = queue();
  auto s = clblast::Syr2(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
}
void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto a_size = n * a_ld;
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  auto queue_cl = queue();
  auto s = clblast::Syr2(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, a_buffer(), 0, a_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
}

// SPR2
void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* ap) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto ap_size = ((n*(n+1)) / 2);
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  auto ap_buffer = clblast::Buffer<float>(context, ap_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
  auto queue_cl = queue();
  auto s = clblast::Spr2(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, ap_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  ap_buffer.Read(queue, ap_size, reinterpret_cast<float*>(ap));
}
void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, const double* y, const int y_inc, double* ap) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto ap_size = ((n*(n+1)) / 2);
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  auto ap_buffer = clblast::Buffer<double>(context, ap_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
  auto queue_cl = queue();
  auto s = clblast::Spr2(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc, ap_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  ap_buffer.Read(queue, ap_size, reinterpret_cast<double*>(ap));
}
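// A minimal usage sketch for the level-2 wrappers above (illustration only, hence excluded from
// compilation; the driver function and its values are invented, only the cblas_ssyr call itself
// comes from this file):
#if 0
static void example_ssyr() {
  const int n = 3;
  const float x[] = {1.0f, 2.0f, 3.0f};
  float a[9] = {0.0f};  // row-major 3x3 matrix, updated in place
  // Rank-1 update of the upper triangle: A := 2.0 * x * x^T + A
  cblas_ssyr(CLBlastLayoutRowMajor, CLBlastTriangleUpper, n, 2.0f, x, 1, a, n);
}
#endif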
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================

// GEMM
void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto b_buffer = clblast::Buffer<float>(context, b_size);
  auto c_buffer = clblast::Buffer<float>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float*>(c));
  auto queue_cl = queue();
  auto s = clblast::Gemm(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
}
void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto b_buffer = clblast::Buffer<double>(context, b_size);
  auto c_buffer = clblast::Buffer<double>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double*>(c));
  auto queue_cl = queue();
  auto s = clblast::Gemm(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
}
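// The a_size/b_size expressions above count how many elements of the host matrix have to be
// copied: the leading dimension times the extent of the non-contiguous direction, which switches
// between m (or n) and k depending on the layout/transpose combination.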
void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto b_buffer = clblast::Buffer<float2>(context, b_size);
  auto c_buffer = clblast::Buffer<float2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Gemm(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
}
void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto b_buffer = clblast::Buffer<double2>(context, b_size);
  auto c_buffer = clblast::Buffer<double2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Gemm(static_cast<clblast::Layout>(layout), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Transpose>(b_transpose), m, n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
}
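// A minimal usage sketch for the GEMM wrappers above (illustration only and excluded from
// compilation; sizes and values are invented):
#if 0
static void example_sgemm() {
  const int m = 2, n = 2, k = 2;
  const float a[] = {1.0f, 2.0f, 3.0f, 4.0f};  // 2x2, row-major
  const float b[] = {5.0f, 6.0f, 7.0f, 8.0f};  // 2x2, row-major
  float c[4] = {0.0f};                         // 2x2 result
  // C := 1.0 * A * B + 0.0 * C
  cblas_sgemm(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
              m, n, k, 1.0f, a, k, b, n, 0.0f, c, n);
}
#endif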
// SYMM
void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto b_buffer = clblast::Buffer<float>(context, b_size);
  auto c_buffer = clblast::Buffer<float>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float*>(c));
  auto queue_cl = queue();
  auto s = clblast::Symm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
}
void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto b_buffer = clblast::Buffer<double>(context, b_size);
  auto c_buffer = clblast::Buffer<double>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double*>(c));
  auto queue_cl = queue();
  auto s = clblast::Symm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
}
void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto b_buffer = clblast::Buffer<float2>(context, b_size);
  auto c_buffer = clblast::Buffer<float2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Symm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
}
void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto b_buffer = clblast::Buffer<double2>(context, b_size);
  auto c_buffer = clblast::Buffer<double2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Symm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
}
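// For SYMM (and HEMM below) the symmetric/Hermitian input A is m-by-m when applied from the left
// and n-by-n when applied from the right, which is why the size expressions above also branch on
// `side`.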
// HEMM
void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto b_buffer = clblast::Buffer<float2>(context, b_size);
  auto c_buffer = clblast::Buffer<float2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Hemm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
}
void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
  const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto b_buffer = clblast::Buffer<double2>(context, b_size);
  auto c_buffer = clblast::Buffer<double2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Hemm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
}

// SYRK
void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, const float beta, float* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto c_buffer = clblast::Buffer<float>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float*>(c));
  auto queue_cl = queue();
  auto s = clblast::Syrk(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
}
void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, const double beta, double* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto c_buffer = clblast::Buffer<double>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double*>(c));
  auto queue_cl = queue();
  auto s = clblast::Syrk(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
}
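// For the rank-k updates (SYRK/HERK and SYR2K/HER2K below) the output C is always n-by-n, hence
// the fixed c_size = n * c_ld, while the input size switches between n * a_ld and k * a_ld with
// the layout/transpose combination, as in GEMM.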
void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto c_buffer = clblast::Buffer<float2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Syrk(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
}
void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto c_buffer = clblast::Buffer<double2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Syrk(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
}

// HERK
void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const float alpha, const void* a, const int a_ld, const float beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto c_buffer = clblast::Buffer<float2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Herk(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
}
void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const double alpha, const void* a, const int a_ld, const double beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto c_buffer = clblast::Buffer<double2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Herk(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
}
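// Note that HERK takes real-valued alpha and beta even though its matrices are complex (float for
// cblas_cherk, double for cblas_zherk), matching the Netlib CBLAS interface.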
// SYR2K
void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto b_buffer = clblast::Buffer<float>(context, b_size);
  auto c_buffer = clblast::Buffer<float>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float*>(c));
  auto queue_cl = queue();
  auto s = clblast::Syr2k(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
}
void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto b_buffer = clblast::Buffer<double>(context, b_size);
  auto c_buffer = clblast::Buffer<double>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double*>(c));
  auto queue_cl = queue();
  auto s = clblast::Syr2k(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
}
void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
  const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto b_buffer = clblast::Buffer<float2>(context, b_size);
  auto c_buffer = clblast::Buffer<float2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Syr2k(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
}
void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
  const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto b_buffer = clblast::Buffer<double2>(context, b_size);
  auto c_buffer = clblast::Buffer<double2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Syr2k(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
}

// HER2K
void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const float beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto b_buffer = clblast::Buffer<float2>(context, b_size);
  auto c_buffer = clblast::Buffer<float2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Her2k(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
}
void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const double beta, void* c, const int c_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = beta;
  const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
  const auto c_size = n * c_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto b_buffer = clblast::Buffer<double2>(context, b_size);
  auto c_buffer = clblast::Buffer<double2>(context, c_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
  auto queue_cl = queue();
  auto s = clblast::Her2k(static_cast<clblast::Layout>(layout), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(ab_transpose), n, k, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, beta_cpp, c_buffer(), 0, c_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
}
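// HER2K mixes the scalar types: alpha is complex (passed as a void pointer) while beta stays
// real, again following the Netlib CBLAS convention.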
// TRMM
void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto b_buffer = clblast::Buffer<float>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
  auto queue_cl = queue();
  auto s = clblast::Trmm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
}
void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto b_buffer = clblast::Buffer<double>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
  auto queue_cl = queue();
  auto s = clblast::Trmm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
}
void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto b_buffer = clblast::Buffer<float2>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
  auto queue_cl = queue();
  auto s = clblast::Trmm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
}
void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto b_buffer = clblast::Buffer<double2>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
  auto queue_cl = queue();
  auto s = clblast::Trmm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
}

// TRSM
void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto b_buffer = clblast::Buffer<float>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
  auto queue_cl = queue();
  auto s = clblast::Trsm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
}
void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto b_buffer = clblast::Buffer<double>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
  auto queue_cl = queue();
  auto s = clblast::Trsm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
}
void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
  const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto b_buffer = clblast::Buffer<float2>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
  auto queue_cl = queue();
  auto s = clblast::Trsm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
}
void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
  const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
  const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto b_buffer = clblast::Buffer<double2>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
  auto queue_cl = queue();
  auto s = clblast::Trsm(static_cast<clblast::Layout>(layout), static_cast<clblast::Side>(side), static_cast<clblast::Triangle>(triangle), static_cast<clblast::Transpose>(a_transpose), static_cast<clblast::Diagonal>(diagonal), m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); }
  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
}
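// TRMM and TRSM update B in place: the wrappers write B to the device, run the routine, and read
// the same buffer back into the caller's B array; there is no separate output matrix.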
// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================

// HAD
void cblas_shad(const int n, const float alpha,
                const float* x, const int x_inc, const float* y, const int y_inc,
                const float beta, float* z, const int z_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto z_size = n * z_inc;
  auto x_buffer = clblast::Buffer<float>(context, x_size);
  auto y_buffer = clblast::Buffer<float>(context, y_size);
  auto z_buffer = clblast::Buffer<float>(context, z_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
  z_buffer.Write(queue, z_size, reinterpret_cast<float*>(z));
  auto queue_cl = queue();
  auto s = clblast::Had(n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                        beta_cpp, z_buffer(), 0, z_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  z_buffer.Read(queue, z_size, reinterpret_cast<float*>(z));
}
void cblas_dhad(const int n, const double alpha,
                const double* x, const int x_inc, const double* y, const int y_inc,
                const double beta, double* z, const int z_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto beta_cpp = beta;
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto z_size = n * z_inc;
  auto x_buffer = clblast::Buffer<double>(context, x_size);
  auto y_buffer = clblast::Buffer<double>(context, y_size);
  auto z_buffer = clblast::Buffer<double>(context, z_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
  z_buffer.Write(queue, z_size, reinterpret_cast<double*>(z));
  auto queue_cl = queue();
  auto s = clblast::Had(n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                        beta_cpp, z_buffer(), 0, z_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  z_buffer.Read(queue, z_size, reinterpret_cast<double*>(z));
}
void cblas_chad(const int n, const void* alpha,
                const void* x, const int x_inc, const void* y, const int y_inc,
                const void* beta, void* z, const int z_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0],
                                reinterpret_cast<const float*>(alpha)[1]};
  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0],
                               reinterpret_cast<const float*>(beta)[1]};
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto z_size = n * z_inc;
  auto x_buffer = clblast::Buffer<float2>(context, x_size);
  auto y_buffer = clblast::Buffer<float2>(context, y_size);
  auto z_buffer = clblast::Buffer<float2>(context, z_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
  z_buffer.Write(queue, z_size, reinterpret_cast<float2*>(z));
  auto queue_cl = queue();
  auto s = clblast::Had(n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                        beta_cpp, z_buffer(), 0, z_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  z_buffer.Read(queue, z_size, reinterpret_cast<float2*>(z));
}
void cblas_zhad(const int n, const void* alpha,
                const void* x, const int x_inc, const void* y, const int y_inc,
                const void* beta, void* z, const int z_inc) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0],
                                 reinterpret_cast<const double*>(alpha)[1]};
  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0],
                                reinterpret_cast<const double*>(beta)[1]};
  const auto x_size = n * x_inc;
  const auto y_size = n * y_inc;
  const auto z_size = n * z_inc;
  auto x_buffer = clblast::Buffer<double2>(context, x_size);
  auto y_buffer = clblast::Buffer<double2>(context, y_size);
  auto z_buffer = clblast::Buffer<double2>(context, z_size);
  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
  z_buffer.Write(queue, z_size, reinterpret_cast<double2*>(z));
  auto queue_cl = queue();
  auto s = clblast::Had(n, alpha_cpp, x_buffer(), 0, x_inc, y_buffer(), 0, y_inc,
                        beta_cpp, z_buffer(), 0, z_inc, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  z_buffer.Read(queue, z_size, reinterpret_cast<double2*>(z));
}
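// Illustrative usage sketch (not part of the original source): HAD computes the element-wise
// (Hadamard) product z = alpha * x .* y + beta * z. For example, with hypothetical host arrays of
// length n and unit increments:
//
//   cblas_shad(n, 1.0f, x, 1, y, 1, 0.0f, z, 1);  // z[i] = x[i] * y[i]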
// OMATCOPY
void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                     const int m, const int n, const float alpha,
                     const float* a, const int a_ld, float* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ?
      n * b_ld : m * b_ld;
  auto a_buffer = clblast::Buffer<float>(context, a_size);
  auto b_buffer = clblast::Buffer<float>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
  auto queue_cl = queue();
  auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
                             static_cast<clblast::Transpose>(a_transpose),
                             m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
}
void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                     const int m, const int n, const double alpha,
                     const double* a, const int a_ld, double* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = alpha;
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ?
      n * b_ld : m * b_ld;
  auto a_buffer = clblast::Buffer<double>(context, a_size);
  auto b_buffer = clblast::Buffer<double>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
  auto queue_cl = queue();
  auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
                             static_cast<clblast::Transpose>(a_transpose),
                             m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
}
void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                     const int m, const int n, const void* alpha,
                     const void* a, const int a_ld, void* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0],
                                reinterpret_cast<const float*>(alpha)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ?
      n * b_ld : m * b_ld;
  auto a_buffer = clblast::Buffer<float2>(context, a_size);
  auto b_buffer = clblast::Buffer<float2>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
  auto queue_cl = queue();
  auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
                             static_cast<clblast::Transpose>(a_transpose),
                             m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
}
void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                     const int m, const int n, const void* alpha,
                     const void* a, const int a_ld, void* b, const int b_ld) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0],
                                 reinterpret_cast<const double*>(alpha)[1]};
  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
  const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
                       (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ?
      n * b_ld : m * b_ld;
  auto a_buffer = clblast::Buffer<double2>(context, a_size);
  auto b_buffer = clblast::Buffer<double2>(context, b_size);
  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
  b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
  auto queue_cl = queue();
  auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
                             static_cast<clblast::Transpose>(a_transpose),
                             m, n, alpha_cpp, a_buffer(), 0, a_ld, b_buffer(), 0, b_ld, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
}
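// Illustrative usage sketch (not part of the original source): OMATCOPY performs a scaled
// out-of-place copy or transpose, B = alpha * op(A). Transposing a hypothetical row-major
// m-by-n host matrix 'a' (leading dimension n) into an n-by-m matrix 'b' (leading dimension m):
//
//   cblas_somatcopy(CLBlastLayoutRowMajor, CLBlastTransposeYes, m, n, 1.0f, a, n, b, m);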
// IM2COL
void cblas_sim2col(const CLBlastKernelMode kernel_mode,
                   const int channels, const int height, const int width,
                   const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
                   const int stride_h, const int stride_w,
                   const int dilation_h, const int dilation_w,
                   const float* im, float* col) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto im_size = height * width * channels;
  const auto col_size = height * width * channels;
  auto im_buffer = clblast::Buffer<float>(context, im_size);
  auto col_buffer = clblast::Buffer<float>(context, col_size);
  im_buffer.Write(queue, im_size, reinterpret_cast<const float*>(im));
  col_buffer.Write(queue, col_size, reinterpret_cast<float*>(col));
  auto queue_cl = queue();
  auto s = clblast::Im2col<float>(static_cast<clblast::KernelMode>(kernel_mode),
                                  channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
                                  stride_h, stride_w, dilation_h, dilation_w,
                                  im_buffer(), 0, col_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  col_buffer.Read(queue, col_size, reinterpret_cast<float*>(col));
}
void cblas_dim2col(const CLBlastKernelMode kernel_mode,
                   const int channels, const int height, const int width,
                   const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
                   const int stride_h, const int stride_w,
                   const int dilation_h, const int dilation_w,
                   const double* im, double* col) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto im_size = height * width * channels;
  const auto col_size = height * width * channels;
  auto im_buffer = clblast::Buffer<double>(context, im_size);
  auto col_buffer = clblast::Buffer<double>(context, col_size);
  im_buffer.Write(queue, im_size, reinterpret_cast<const double*>(im));
  col_buffer.Write(queue, col_size, reinterpret_cast<double*>(col));
  auto queue_cl = queue();
  auto s = clblast::Im2col<double>(static_cast<clblast::KernelMode>(kernel_mode),
                                   channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
                                   stride_h, stride_w, dilation_h, dilation_w,
                                   im_buffer(), 0, col_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  col_buffer.Read(queue, col_size, reinterpret_cast<double*>(col));
}
void cblas_cim2col(const CLBlastKernelMode kernel_mode,
                   const int channels, const int height, const int width,
                   const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
                   const int stride_h, const int stride_w,
                   const int dilation_h, const int dilation_w,
                   const void* im, void* col) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto im_size = height * width * channels;
  const auto col_size = height * width * channels;
  auto im_buffer = clblast::Buffer<float2>(context, im_size);
  auto col_buffer = clblast::Buffer<float2>(context, col_size);
  im_buffer.Write(queue, im_size, reinterpret_cast<const float2*>(im));
  col_buffer.Write(queue, col_size, reinterpret_cast<float2*>(col));
  auto queue_cl = queue();
  auto s = clblast::Im2col<float2>(static_cast<clblast::KernelMode>(kernel_mode),
                                   channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
                                   stride_h, stride_w, dilation_h, dilation_w,
                                   im_buffer(), 0, col_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  col_buffer.Read(queue, col_size, reinterpret_cast<float2*>(col));
}
void cblas_zim2col(const CLBlastKernelMode kernel_mode,
                   const int channels, const int height, const int width,
                   const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
                   const int stride_h, const int stride_w,
                   const int dilation_h, const int dilation_w,
                   const void* im, void* col) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto im_size = height * width * channels;
  const auto col_size = height * width * channels;
  auto im_buffer = clblast::Buffer<double2>(context, im_size);
  auto col_buffer = clblast::Buffer<double2>(context, col_size);
  im_buffer.Write(queue, im_size, reinterpret_cast<const double2*>(im));
  col_buffer.Write(queue, col_size, reinterpret_cast<double2*>(col));
  auto queue_cl = queue();
  auto s = clblast::Im2col<double2>(static_cast<clblast::KernelMode>(kernel_mode),
                                    channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
                                    stride_h, stride_w, dilation_h, dilation_w,
                                    im_buffer(), 0, col_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  col_buffer.Read(queue, col_size, reinterpret_cast<double2*>(col));
}
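// Illustrative usage sketch (not part of the original source): im2col unfolds image patches into
// columns so that a convolution can be expressed as a GEMM. A hypothetical 3x3 same-padded
// convolution setup over a single-precision image 'im' would call:
//
//   cblas_sim2col(CLBlastKernelModeCrossCorrelation, channels, height, width,
//                 3, 3,    // kernel_h, kernel_w
//                 1, 1,    // pad_h, pad_w
//                 1, 1,    // stride_h, stride_w
//                 1, 1,    // dilation_h, dilation_w
//                 im, col);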
// COL2IM
void cblas_scol2im(const CLBlastKernelMode kernel_mode,
                   const int channels, const int height, const int width,
                   const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
                   const int stride_h, const int stride_w,
                   const int dilation_h, const int dilation_w,
                   const float* col, float* im) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto col_size = height * width * channels;
  const auto im_size = height * width * channels;
  auto col_buffer = clblast::Buffer<float>(context, col_size);
  auto im_buffer = clblast::Buffer<float>(context, im_size);
  col_buffer.Write(queue, col_size, reinterpret_cast<const float*>(col));
  im_buffer.Write(queue, im_size, reinterpret_cast<float*>(im));
  auto queue_cl = queue();
  auto s = clblast::Col2im<float>(static_cast<clblast::KernelMode>(kernel_mode),
                                  channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
                                  stride_h, stride_w, dilation_h, dilation_w,
                                  col_buffer(), 0, im_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  im_buffer.Read(queue, im_size, reinterpret_cast<float*>(im));
}
void cblas_dcol2im(const CLBlastKernelMode kernel_mode,
                   const int channels, const int height, const int width,
                   const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
                   const int stride_h, const int stride_w,
                   const int dilation_h, const int dilation_w,
                   const double* col, double* im) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto col_size = height * width * channels;
  const auto im_size = height * width * channels;
  auto col_buffer = clblast::Buffer<double>(context, col_size);
  auto im_buffer = clblast::Buffer<double>(context, im_size);
  col_buffer.Write(queue, col_size, reinterpret_cast<const double*>(col));
  im_buffer.Write(queue, im_size, reinterpret_cast<double*>(im));
  auto queue_cl = queue();
  auto s = clblast::Col2im<double>(static_cast<clblast::KernelMode>(kernel_mode),
                                   channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
                                   stride_h, stride_w, dilation_h, dilation_w,
                                   col_buffer(), 0, im_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  im_buffer.Read(queue, im_size, reinterpret_cast<double*>(im));
}
void cblas_ccol2im(const CLBlastKernelMode kernel_mode,
                   const int channels, const int height, const int width,
                   const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
                   const int stride_h, const int stride_w,
                   const int dilation_h, const int dilation_w,
                   const void* col, void* im) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto col_size = height * width * channels;
  const auto im_size = height * width * channels;
  auto col_buffer = clblast::Buffer<float2>(context, col_size);
  auto im_buffer = clblast::Buffer<float2>(context, im_size);
  col_buffer.Write(queue, col_size, reinterpret_cast<const float2*>(col));
  im_buffer.Write(queue, im_size, reinterpret_cast<float2*>(im));
  auto queue_cl = queue();
  auto s = clblast::Col2im<float2>(static_cast<clblast::KernelMode>(kernel_mode),
                                   channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
                                   stride_h, stride_w, dilation_h, dilation_w,
                                   col_buffer(), 0, im_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  im_buffer.Read(queue, im_size, reinterpret_cast<float2*>(im));
}
void cblas_zcol2im(const CLBlastKernelMode kernel_mode,
                   const int channels, const int height, const int width,
                   const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
                   const int stride_h, const int stride_w,
                   const int dilation_h, const int dilation_w,
                   const void* col, void* im) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  const auto col_size = height * width * channels;
  const auto im_size = height * width * channels;
  auto col_buffer = clblast::Buffer<double2>(context, col_size);
  auto im_buffer = clblast::Buffer<double2>(context, im_size);
  col_buffer.Write(queue, col_size, reinterpret_cast<const double2*>(col));
  im_buffer.Write(queue, im_size, reinterpret_cast<double2*>(im));
  auto queue_cl = queue();
  auto s = clblast::Col2im<double2>(static_cast<clblast::KernelMode>(kernel_mode),
                                    channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
                                    stride_h, stride_w, dilation_h, dilation_w,
                                    col_buffer(), 0, im_buffer(), 0, &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  im_buffer.Read(queue, im_size, reinterpret_cast<double2*>(im));
}
// =================================================================================================
CLBlast-1.6.3/src/clpp11.hpp000066400000000000000000001056671463263031500154270ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API
// calls. The main benefits are increased abstraction, automatic memory management, and portability.
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
// therefore contains the following header copyright notice:
//
// =================================================================================================
//
// Copyright 2015 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// =================================================================================================

#ifndef CLBLAST_CLPP11_H_
#define CLBLAST_CLPP11_H_

// C++
#include <algorithm> // std::copy
#include <string>    // std::string
#include <vector>    // std::vector
#include <memory>    // std::shared_ptr
#include <numeric>   // std::accumulate
#include <cstring>   // std::strlen
#include <cstdio>    // fprintf, stderr
#include <cassert>

// OpenCL
#define CL_TARGET_OPENCL_VERSION 110
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS // to disable deprecation warnings
#if defined(__APPLE__) || defined(__MACOSX)
  #include <OpenCL/opencl.h>
#else
  #include <CL/opencl.h>
#endif

// Android support (missing C++11 functions to_string, stod, and stoi)
#ifdef __ANDROID__
  #include "utilities/android.hpp"
#endif

// Exception classes
#include "cxpp11_common.hpp"

namespace clblast {
// =================================================================================================

// Represents a runtime error returned by an OpenCL API function
class CLCudaAPIError : public ErrorCode {
 public:
  explicit CLCudaAPIError(cl_int status, const std::string &where):
      ErrorCode(status, where, "OpenCL error: " + where + ": " +
                std::to_string(static_cast<int>(status))) {
  }

  static void Check(const cl_int status, const std::string &where) {
    if (status != CL_SUCCESS) {
      throw CLCudaAPIError(status, where);
    }
  }

  static void CheckDtor(const cl_int status, const std::string &where) {
    if (status != CL_SUCCESS) {
      fprintf(stderr, "CLBlast: %s (ignoring)\n", CLCudaAPIError(status, where).what());
    }
  }
};

// Exception returned when building a program
using CLCudaAPIBuildError = CLCudaAPIError;

// =================================================================================================

// Error occurred in OpenCL
#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))

// Error occurred in OpenCL (no-exception version for destructors)
#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
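// Illustrative usage sketch (not part of the original source): these macros turn raw OpenCL
// status codes into exceptions, so call-sites reduce to a single line, e.g.:
//
//   CheckError(clFinish(queue));  // throws CLCudaAPIError on any non-CL_SUCCESS status
//
// CheckErrorDtor only logs to stderr instead of throwing, which makes it safe inside destructors.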
// =================================================================================================

// C++11 version of 'cl_event'
class Event {
 public:
  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
  explicit Event(const cl_event event): event_(new cl_event) {
    *event_ = event;
  }

  // Regular constructor with memory management
  explicit Event():
      event_(new cl_event, [](cl_event* e) {
        if (*e) { CheckErrorDtor(clReleaseEvent(*e)); }
        delete e;
      }) {
    *event_ = nullptr;
  }

  // Waits for completion of this event
  void WaitForCompletion() const {
    CheckError(clWaitForEvents(1, &(*event_)));
  }

  // Retrieves the elapsed time of the last recorded event.
  // (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo'
  //  function: http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx)
  // However, in our case the reply size is fixed to be cl_ulong, so we are not affected.
  float GetElapsedTime() const {
    WaitForCompletion();
    const auto bytes = sizeof(cl_ulong);
    auto time_start = cl_ulong{0};
    CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes,
                                       &time_start, nullptr));
    auto time_end = cl_ulong{0};
    CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes,
                                       &time_end, nullptr));
    return static_cast<float>(time_end - time_start) * 1.0e-6f;
  }

  // Accessor to the private data-member
  cl_event& operator()() { return *event_; }
  const cl_event& operator()() const { return *event_; }
  cl_event* pointer() { return &(*event_); }
  const cl_event* pointer() const { return &(*event_); }
 private:
  std::shared_ptr<cl_event> event_;
};

// Pointer to an OpenCL event
using EventPointer = cl_event*;
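// Illustrative usage sketch (not part of the original source): timing a kernel with this wrapper,
// assuming a profiling-enabled queue and hypothetical 'kernel', 'global' and 'local' ranges:
//
//   auto event = Event();
//   kernel.Launch(queue, global, local, event.pointer());
//   event.WaitForCompletion();
//   const auto milliseconds = event.GetElapsedTime();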
// =================================================================================================

// Raw platform ID type
using RawPlatformID = cl_platform_id;

// C++11 version of 'cl_platform_id'
class Platform {
 public:
  // Constructor based on the regular OpenCL data-type
  explicit Platform(const cl_platform_id platform): platform_(platform) { }

  // Initializes the platform
  explicit Platform(const size_t platform_id) {
    auto num_platforms = cl_uint{0};
    CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
    if (num_platforms == 0) {
      throw RuntimeError("Platform: no platforms found");
    }
    if (platform_id >= num_platforms) {
      throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
    }
    auto platforms = std::vector<cl_platform_id>(num_platforms);
    CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
    platform_ = platforms[platform_id];
  }

  // Methods to retrieve platform information
  std::string Name() const { return GetInfoString(CL_PLATFORM_NAME); }
  std::string Vendor() const { return GetInfoString(CL_PLATFORM_VENDOR); }
  std::string Version() const { return GetInfoString(CL_PLATFORM_VERSION); }

  // Returns the number of devices on this platform
  size_t NumDevices() const {
    auto result = cl_uint{0};
    CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result));
    return static_cast<size_t>(result);
  }

  // Accessor to the private data-member
  const RawPlatformID& operator()() const { return platform_; }
 private:
  cl_platform_id platform_;

  // Private helper functions
  std::string GetInfoString(const cl_device_info info) const {
    auto bytes = size_t{0};
    CheckError(clGetPlatformInfo(platform_, info, 0, nullptr, &bytes));
    auto result = std::string{};
    result.resize(bytes);
    CheckError(clGetPlatformInfo(platform_, info, bytes, &result[0], nullptr));
    result.resize(strlen(result.c_str()));  // Removes any trailing '\0'-characters
    return result;
  }
};

// Retrieves a vector with all platforms
inline std::vector<Platform> GetAllPlatforms() {
  auto num_platforms = cl_uint{0};
  CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
  auto all_platforms = std::vector<Platform>();
  for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
    all_platforms.push_back(Platform(platform_id));
  }
  return all_platforms;
}

// =================================================================================================

// Raw device ID type
using RawDeviceID = cl_device_id;

// C++11 version of 'cl_device_id'
class Device {
 public:
  // Constructor based on the regular OpenCL data-type
  explicit Device(const cl_device_id device): device_(device) { }

  // Initialize the device. Note that this constructor can throw exceptions!
  explicit Device(const Platform &platform, const size_t device_id) {
    auto num_devices = platform.NumDevices();
    if (num_devices == 0) {
      throw RuntimeError("Device: no devices found");
    }
    if (device_id >= num_devices) {
      throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
    }
    auto devices = std::vector<cl_device_id>(num_devices);
    CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
                              devices.data(), nullptr));
    device_ = devices[device_id];
  }

  // Methods to retrieve device information
  RawPlatformID PlatformID() const { return GetInfo<cl_platform_id>(CL_DEVICE_PLATFORM); }
  std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
  size_t VersionNumber() const {
    std::string version_string = Version().substr(7);
    // Space separates the end of the OpenCL version number from the beginning of the
    // vendor-specific information.
    size_t next_whitespace = version_string.find(' ');
    size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace)));
    return version;
  }
  std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
  std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
  std::string Type() const {
    auto type = GetInfo<cl_device_type>(CL_DEVICE_TYPE);
    switch(type) {
      case CL_DEVICE_TYPE_CPU: return "CPU";
      case CL_DEVICE_TYPE_GPU: return "GPU";
      case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator";
      default: return "default";
    }
  }
  size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
  size_t MaxWorkItemDimensions() const {
    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
  }
  std::vector<size_t> MaxWorkItemSizes() const {
    return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
  }
  unsigned long LocalMemSize() const {
    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
  }
  std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
  bool HasExtension(const std::string &extension) const {
    const auto extensions = Capabilities();
    return extensions.find(extension) != std::string::npos;
  }
  bool SupportsFP64() const { return HasExtension("cl_khr_fp64"); }
  bool SupportsFP16() const {
    if (Name() == "Mali-T628") { return true; } // supports fp16 but not cl_khr_fp16 officially
    return HasExtension("cl_khr_fp16");
  }

  size_t CoreClock() const {
    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY));
  }
  size_t ComputeUnits() const {
    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_COMPUTE_UNITS));
  }
  unsigned long MemorySize() const {
    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE));
  }
  unsigned long MaxAllocSize() const {
    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE));
  }
  size_t MemoryClock() const { return 0; }    // Not exposed in OpenCL
  size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
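  // Illustrative usage sketch (not part of the original source): a typical query sequence when
  // checking what the device at hand offers:
  //
  //   auto device = Device(Platform(size_t{0}), size_t{0});
  //   printf("%s: %zu compute units, %lu bytes of local memory\n",
  //          device.Name().c_str(), device.ComputeUnits(), device.LocalMemSize());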
  // Configuration-validity checks
  bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const {
    return (local_mem_usage <= LocalMemSize());
  }
  bool IsThreadConfigValid(const std::vector<size_t> &local) const {
    auto local_size = size_t{1};
    for (const auto &item: local) { local_size *= item; }
    for (auto i=size_t{0}; i<local.size(); ++i) {
      if (local[i] > MaxWorkItemSizes()[i]) { return false; }
    }
    if (local_size > MaxWorkGroupSize()) { return false; }
    if (local.size() > MaxWorkItemDimensions()) { return false; }
    return true;
  }

  // Query for a specific type of device or brand
  bool IsCPU() const { return Type() == "CPU"; }
  bool IsGPU() const { return Type() == "GPU"; }
  bool IsAMD() const {
    return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
           Vendor() == "AuthenticAMD";
  }
  bool IsNVIDIA() const {
    return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation";
  }
  bool IsIntel() const {
    return Vendor() == "INTEL" || Vendor() == "Intel" ||
           Vendor() == "GenuineIntel" || Vendor() == "Intel(R) Corporation";
  }
  bool IsARM() const { return Vendor() == "ARM"; }
  bool IsQualcomm() const { return Vendor() == "QUALCOMM"; }

  // Platform specific extensions
  std::string AMDBoardName() const { // check for 'cl_amd_device_attribute_query' first
    #ifndef CL_DEVICE_BOARD_NAME_AMD
      #define CL_DEVICE_BOARD_NAME_AMD 0x4038
    #endif
    return GetInfoString(CL_DEVICE_BOARD_NAME_AMD);
  }
  std::string NVIDIAComputeCapability() const { // check for 'cl_nv_device_attribute_query' first
    #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
      #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
    #endif
    #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
      #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
    #endif
    return std::string{"SM"} +
           std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV)) +
           std::string{"."} +
           std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
  }

  // Returns if the Nvidia chip is a Volta or later architecture (sm_70 or higher)
  bool IsPostNVIDIAVolta() const {
    if (HasExtension("cl_nv_device_attribute_query")) {
      return GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV) >= 7;
    }
    return false;
  }

  // Returns the Qualcomm Adreno GPU version (i.e. a650, a730, a740, etc.)
  std::string AdrenoVersion() const {
    if (IsQualcomm()) {
      return GetInfoString(CL_DEVICE_OPENCL_C_VERSION);
    }
    else { return std::string{""}; }
  }

  // Retrieves the above extra information (if present)
  std::string GetExtraInfo() const {
    if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); }
    if (HasExtension("cl_nv_device_attribute_query")) { return NVIDIAComputeCapability(); }
    else { return std::string{""}; }
  }

  // Accessor to the private data-member
  const RawDeviceID& operator()() const { return device_; }
 private:
  cl_device_id device_;

  // Private helper functions
  template <typename T>
  T GetInfo(const cl_device_info info) const {
    auto bytes = size_t{0};
    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
    auto result = T(0);
    CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
    return result;
  }
  template <typename T>
  std::vector<T> GetInfoVector(const cl_device_info info) const {
    auto bytes = size_t{0};
    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
    auto result = std::vector<T>(bytes/sizeof(T));
    CheckError(clGetDeviceInfo(device_, info, bytes, result.data(), nullptr));
    return result;
  }
  std::string GetInfoString(const cl_device_info info) const {
    auto bytes = size_t{0};
    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
    auto result = std::string{};
    result.resize(bytes);
    CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
    result.resize(strlen(result.c_str()));  // Removes any trailing '\0'-characters
    return result;
  }
};

// =================================================================================================

// Raw context type
using RawContext = cl_context;

// C++11 version of 'cl_context'
class Context {
 public:
  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
  explicit Context(const cl_context context): context_(new cl_context) {
    *context_ = context;
  }

  // Regular constructor with memory management
  explicit Context(const Device &device):
      context_(new cl_context, [](cl_context* c) {
        if (*c) { CheckErrorDtor(clReleaseContext(*c)); }
        delete c;
      }) {
    auto status = CL_SUCCESS;
    const cl_device_id dev = device();
    *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
    CLCudaAPIError::Check(status, "clCreateContext");
  }

  // Accessor to the private data-member
  const RawContext& operator()() const { return *context_; }
  RawContext* pointer() const { return &(*context_); }
 private:
  std::shared_ptr<cl_context> context_;
};

// Pointer to an OpenCL context
using ContextPointer = cl_context*;

// =================================================================================================
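// Illustrative usage sketch (not part of the original source): combining the wrappers defined in
// this header to obtain a command queue on the first device of the first platform:
//
//   auto platform = Platform(size_t{0});
//   auto device = Device(platform, size_t{0});
//   auto context = Context(device);
//   auto queue = Queue(context, device);  // Queue is defined further below in this header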
// C++11 version of 'cl_program'.
class Program {
 public:
  // Source-based constructor with memory management
  explicit Program(const Context &context, const std::string &source) {
    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
      const std::string source_null_kernel = source + "\n__kernel void null_kernel() {}\n";
      const char *source_ptr = &source_null_kernel[0];
      const auto length = source_null_kernel.length();
    #else
      const char *source_ptr = &source[0];
      const auto length = source.length();
    #endif
    auto status = CL_SUCCESS;
    program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
    CLCudaAPIError::Check(status, "clCreateProgramWithSource");
  }

  // Binary-based constructor with memory management
  explicit Program(const Device &device, const Context &context, const std::string &binary) {
    const char *binary_ptr = &binary[0];
    const auto length = binary.length();
    auto status1 = CL_SUCCESS;
    auto status2 = CL_SUCCESS;
    const auto dev = device();
    program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
                                         reinterpret_cast<const unsigned char**>(&binary_ptr),
                                         &status1, &status2);
    CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)");
    CLCudaAPIError::Check(status2, "clCreateProgramWithBinary");
  }

  // Clean-up
  ~Program() {
    // Causes an access violation under Windows or Android when the driver is already unloaded
    #if !defined(__ANDROID__) && !defined(_MSC_VER)
      if (program_) { CheckErrorDtor(clReleaseProgram(program_)); }
    #endif
  }

  // Compiles the device program and checks whether or not there are any warnings/errors
  void Build(const Device &device, std::vector<std::string> &options) {
    auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
    const cl_device_id dev = device();
    CheckError(clBuildProgram(program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
  }

  // Confirms whether a certain status code is an actual compilation error or warning
  bool StatusIsCompilationWarningOrError(const cl_int status) const {
    return (status == CL_BUILD_PROGRAM_FAILURE);
  }

  // Retrieves the warning/error message from the compiler (if any)
  std::string GetBuildInfo(const Device &device) const {
    auto bytes = size_t{0};
    auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
    CheckError(clGetProgramBuildInfo(program_, device(), query, 0, nullptr, &bytes));
    auto result = std::string{};
    result.resize(bytes);
    CheckError(clGetProgramBuildInfo(program_, device(), query, bytes, &result[0], nullptr));
    return result;
  }

  // Retrieves a binary or an intermediate representation of the compiled program
  std::string GetIR() const {
    cl_uint num_devices = 0;
    CheckError(clGetProgramInfo(program_, CL_PROGRAM_NUM_DEVICES,
                                sizeof(cl_uint), &num_devices, nullptr));
    std::vector<size_t> binSizesInBytes(num_devices, 0);
    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
                                num_devices * sizeof(size_t), binSizesInBytes.data(), nullptr));
    auto bytes = size_t{0};
    auto binSizeIter = size_t{0};
    // Loop over the program binary sizes to find a binary whose size is > 0.
    // The current logic assumes that there is ever only one valid program binary
    // in a given cl_program.
    // This should be the case unless the cl_program is built for all or a subset of the
    // devices associated with it.
    for (; binSizeIter < binSizesInBytes.size(); ++binSizeIter) {
      if (binSizesInBytes[binSizeIter] > 0) {
        bytes = binSizesInBytes[binSizeIter];
        break;
      }
    }
    auto result = std::string{};
    result.resize(bytes);
    std::vector<char*> out(num_devices, nullptr);
    out[binSizeIter] = const_cast<char*>(result.data());
    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
                                num_devices * sizeof(char*), out.data(), nullptr));
    return result;
  }

  // Accessor to the private data-member
  const cl_program& operator()() const { return program_; }
 private:
  cl_program program_ = nullptr;
};

// =================================================================================================

// Raw command-queue type
using RawCommandQueue = cl_command_queue;

// C++11 version of 'cl_command_queue'
class Queue {
 public:
  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
  explicit Queue(const cl_command_queue queue): queue_(new cl_command_queue) {
    *queue_ = queue;
  }

  // Regular constructor with memory management
  explicit Queue(const Context &context, const Device &device):
      queue_(new cl_command_queue, [](cl_command_queue* s) {
        if (*s) { CheckErrorDtor(clReleaseCommandQueue(*s)); }
        delete s;
      }) {
    auto status = CL_SUCCESS;
    *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
    CLCudaAPIError::Check(status, "clCreateCommandQueue");
  }

  // Synchronizes the queue
  void Finish(Event &) const { Finish(); }
  void Finish() const { CheckError(clFinish(*queue_)); }

  // Retrieves the corresponding context or device
  Context GetContext() const {
    auto bytes = size_t{0};
    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes));
    cl_context result;
    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr));
    return Context(result);
  }
  Device GetDevice() const {
    auto bytes = size_t{0};
    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes));
    cl_device_id result;
    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr));
    return Device(result);
  }

  // Accessor to the private data-member
  const RawCommandQueue& operator()() const { return *queue_; }
 private:
  std::shared_ptr<cl_command_queue> queue_;
};

// =================================================================================================

// C++11 version of host memory
template <typename T>
class BufferHost {
 public:
  // Regular constructor with memory management
  explicit BufferHost(const Context &, const size_t size):
      buffer_(new std::vector<T>(size)) {
  }

  // Retrieves the actual allocated size in bytes
  size_t GetSize() const { return buffer_->size()*sizeof(T); }

  // Compatibility with std::vector
  size_t size() const { return buffer_->size(); }
  T* begin() { return &(*buffer_)[0]; }
  T* end() { return &(*buffer_)[buffer_->size()-1]; }
  T& operator[](const size_t i) { return (*buffer_)[i]; }
  T* data() { return buffer_->data(); }
  const T* data() const { return buffer_->data(); }

 private:
  std::shared_ptr<std::vector<T>> buffer_;
};

// =================================================================================================

// Enumeration of buffer access types
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
// C++11 version of 'cl_mem'
template <typename T>
class Buffer {
 public:
  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
  explicit Buffer(const cl_mem buffer):
      buffer_(new cl_mem),
      access_(BufferAccess::kNotOwned) {
    *buffer_ = buffer;
  }

  // Regular constructor with memory management. If this class does not own the buffer object, then
  // the memory will not be freed automatically afterwards. If the size is set to 0, this will
  // become a stub containing a nullptr
  explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
      buffer_(new cl_mem, [access, size](cl_mem* m) {
        if (access != BufferAccess::kNotOwned && size > 0) { CheckError(clReleaseMemObject(*m)); }
        delete m;
      }),
      access_(access) {
    auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
    if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
    if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
    auto status = CL_SUCCESS;
    *buffer_ = (size > 0) ? clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status)
                          : nullptr;
    CLCudaAPIError::Check(status, "clCreateBuffer");
  }

  // As above, but now with read/write access as a default
  explicit Buffer(const Context &context, const size_t size):
      Buffer(context, BufferAccess::kReadWrite, size) {
  }

  // Constructs a new buffer based on an existing host-container
  template <typename Iterator>
  explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
      Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
    auto size = static_cast<size_t>(end - start);
    auto pointer = &*start;
    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer,
                                    0, nullptr, nullptr));
    queue.Finish();
  }

  // Copies from device to host: reading the device buffer a-synchronously
  void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
    if (access_ == BufferAccess::kWriteOnly) {
      throw LogicError("Buffer: reading from a write-only buffer");
    }
    CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
                                   host, 0, nullptr, nullptr));
  }
  void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
                 const size_t offset = 0) const {
    if (host.size() < size) { throw LogicError("Buffer: target host buffer is too small"); }
    ReadAsync(queue, size, host.data(), offset);
  }
  void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
                 const size_t offset = 0) const {
    if (host.size() < size) { throw LogicError("Buffer: target host buffer is too small"); }
    ReadAsync(queue, size, host.data(), offset);
  }

  // Copies from device to host: reading the device buffer
  void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
    ReadAsync(queue, size, host, offset);
    queue.Finish();
  }
  void Read(const Queue &queue, const size_t size, std::vector<T> &host,
            const size_t offset = 0) const {
    Read(queue, size, host.data(), offset);
  }
  void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
            const size_t offset = 0) const {
    Read(queue, size, host.data(), offset);
  }

  // Copies from host to device: writing the device buffer a-synchronously
  void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
    if (access_ == BufferAccess::kReadOnly) {
      throw LogicError("Buffer: writing to a read-only buffer");
    }
    if (GetSize() < (offset+size)*sizeof(T)) {
      throw LogicError("Buffer: target device buffer is too small");
    }
    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
                                    host, 0, nullptr, nullptr));
  }
  void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
                  const size_t offset = 0) {
    WriteAsync(queue, size, host.data(), offset);
  }
  void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
                  const size_t offset = 0) {
    WriteAsync(queue, size, host.data(), offset);
  }

  // Copies from host to device: writing the device buffer
  void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
    WriteAsync(queue, size, host, offset);
    queue.Finish();
  }
  void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
             const size_t offset = 0) {
    Write(queue, size, host.data(), offset);
  }
  void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
             const size_t offset = 0) {
    Write(queue, size, host.data(), offset);
  }

  // Copies the contents of this buffer into another device buffer
  void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination,
                   EventPointer event = nullptr) const {
    CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T),
                                   0, nullptr, event));
  }
  void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
    CopyToAsync(queue, size, destination);
    queue.Finish();
  }

  // Retrieves the actual allocated size in bytes
  size_t GetSize() const {
    const auto bytes = sizeof(size_t);
    auto result = size_t{0};
    CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
    return result;
  }

  // Accessor to the private data-member
  const cl_mem& operator()() const { return *buffer_; }
 private:
  std::shared_ptr<cl_mem> buffer_;
  BufferAccess access_;
};
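// Illustrative usage sketch (not part of the original source): a round-trip through device memory
// with the Buffer wrapper, given a context, queue, and a host std::vector<float> 'host_data':
//
//   auto buffer = Buffer<float>(context, host_data.size());
//   buffer.Write(queue, host_data.size(), host_data);  // blocking host-to-device copy
//   // ... launch kernels operating on buffer() ...
//   buffer.Read(queue, host_data.size(), host_data);   // blocking device-to-host copy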
// =================================================================================================

// C++11 version of 'cl_kernel'
class Kernel {
 public:
  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
  explicit Kernel(const cl_kernel kernel): kernel_(new cl_kernel) {
    *kernel_ = kernel;
  }

  // Regular constructor with memory management
  explicit Kernel(const std::shared_ptr<Program> program, const std::string &name):
      kernel_(new cl_kernel, [](cl_kernel* k) {
        if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
        delete k;
      })
      #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
      , null_kernel_(new cl_kernel, [](cl_kernel* k) {
        if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
        delete k;
      })
      #endif
  {
    auto status = CL_SUCCESS;
    *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
    CLCudaAPIError::Check(status, "clCreateKernel");
    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
      *null_kernel_ = clCreateKernel(program->operator()(), "null_kernel", &status);
      CLCudaAPIError::Check(status, "clCreateKernel");
    #endif
  }

  // Sets a kernel argument at the indicated position
  template <typename T>
  void SetArgument(const size_t index, const T &value) {
    CheckError(clSetKernelArg(*kernel_, static_cast<cl_uint>(index), sizeof(T), &value));
  }
  template <typename T>
  void SetArgument(const size_t index, Buffer<T> &value) {
    SetArgument(index, value());
  }

  // Sets all arguments in one go using parameter packs. Note that this overwrites previously set
  // arguments using 'SetArgument' or 'SetArguments'.
  template <typename... Args>
  void SetArguments(Args&... args) {
    SetArgumentsRecursive(0, args...);
  }
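  // Illustrative usage sketch (not part of the original source): with hypothetical kernel
  // arguments 'n_argument', 'x_buffer' and 'y_buffer', setting everything and launching
  // reduces to:
  //
  //   kernel.SetArguments(n_argument, x_buffer, y_buffer);
  //   kernel.Launch(queue, global, local, event.pointer());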
  // Retrieves the amount of local memory used per work-group for this kernel
  unsigned long LocalMemUsage(const Device &device) const {
    const auto bytes = sizeof(cl_ulong);
    auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
    auto result = cl_ulong{0};
    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
    return static_cast<unsigned long>(result);
  }

  // Retrieves the name of the kernel
  std::string GetFunctionName() const {
    auto bytes = size_t{0};
    CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
    auto result = std::string{};
    result.resize(bytes);
    CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr));
    return std::string{result.c_str()};  // Removes any trailing '\0'-characters
  }

  // Launches a kernel onto the specified queue
  void Launch(const Queue &queue, const std::vector<size_t> &global,
              const std::vector<size_t> &local, EventPointer event) {
    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
                                      nullptr, global.data(), local.data(),
                                      0, nullptr, event));
  }

  // As above, but with an event waiting list
  void Launch(const Queue &queue, const std::vector<size_t> &global,
              const std::vector<size_t> &local, EventPointer event,
              const std::vector<Event> &waitForEvents) {

    // Builds a plain version of the events waiting list
    auto waitForEventsPlain = std::vector<cl_event>();
    for (auto &waitEvent : waitForEvents) {
      if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); }
    }

    // Launches the kernel while waiting for other events
    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
                                      nullptr, global.data(),
                                      !local.empty() ? local.data() : nullptr,
                                      static_cast<cl_uint>(waitForEventsPlain.size()),
                                      !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
                                      event));
    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
      const std::vector<size_t> nullRange = {1};
      CheckError(clEnqueueNDRangeKernel(queue(), *null_kernel_, static_cast<cl_uint>(nullRange.size()),
                                        nullptr, nullRange.data(), nullptr,
                                        0, nullptr, nullptr));
    #endif
  }

  // Accessor to the private data-member
  const cl_kernel& operator()() const { return *kernel_; }
 private:
  std::shared_ptr<cl_kernel> kernel_;
  #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
    std::shared_ptr<cl_kernel> null_kernel_;
  #endif

  // Internal implementation for the recursive SetArguments function.
  template <typename T>
  void SetArgumentsRecursive(const size_t index, T &first) {
    SetArgument(index, first);
  }
  template <typename T, typename... Args>
  void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
    SetArgument(index, first);
    SetArgumentsRecursive(index+1, args...);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_CLPP11_H_
#endif
CLBlast-1.6.3/src/cupp11.hpp000066400000000000000000000725531463263031500154330ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API
// calls. The main benefits are increased abstraction, automatic memory management, and portability.
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
// therefore contains the following header copyright notice:
//
// =================================================================================================
//
// Copyright 2015 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// =================================================================================================

#ifndef CLBLAST_CUPP11_H_
#define CLBLAST_CUPP11_H_

// C++
#include <algorithm> // std::copy
#include <string>    // std::string
#include <vector>    // std::vector
#include <memory>    // std::shared_ptr
#include <cstring>   // std::strlen

// CUDA
#define CUDA_NO_HALF // Incompatible with CLBlast's definition; TODO: resolve this
#include <cuda.h>    // CUDA driver API
#include <nvrtc.h>   // NVIDIA runtime compilation API

// Exception classes
#include "cxpp11_common.hpp"

namespace clblast {
// =================================================================================================

// Max-length of strings
constexpr auto kStringLength = 256;

// =================================================================================================

// Represents a runtime error returned by a CUDA driver API function
class CLCudaAPIError : public ErrorCode {
 public:
  explicit CLCudaAPIError(CUresult status, const std::string &where):
      ErrorCode(status, where, "CUDA error: " + where + ": " +
                GetErrorName(status) + " --> " + GetErrorString(status)) {
  }

  static void Check(const CUresult status, const std::string &where) {
    if (status != CUDA_SUCCESS) {
      throw CLCudaAPIError(status, where);
    }
  }

  static void CheckDtor(const CUresult status, const std::string &where) {
    if (status != CUDA_SUCCESS) {
      fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what());
    }
  }

 private:
  std::string GetErrorName(CUresult status) const {
    const char* status_code;
    cuGetErrorName(status, &status_code);
    return std::string(status_code);
  }
  std::string GetErrorString(CUresult status) const {
    const char* status_string;
    cuGetErrorString(status, &status_string);
    return std::string(status_string);
  }
};

// Represents a runtime error returned by a CUDA runtime compilation API function
class CLCudaAPINVRTCError : public ErrorCode {
 public:
  explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where):
      ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) {
  }

  static void Check(const nvrtcResult status, const std::string &where) {
    if (status != NVRTC_SUCCESS) {
      throw CLCudaAPINVRTCError(status, where);
    }
  }

  static void CheckDtor(const nvrtcResult status, const std::string &where) {
    if (status != NVRTC_SUCCESS) {
      fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what());
    }
  }

 private:
  std::string GetErrorString(nvrtcResult status) const {
    const char* status_string = nvrtcGetErrorString(status);
    return std::string(status_string);
  }
};

// Exception returned when building a program
using CLCudaAPIBuildError = CLCudaAPINVRTCError;
// =================================================================================================

// Error occurred in CUDA driver or runtime compilation API
#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))
#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call))

// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors)
#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call))

// =================================================================================================

// C++11 version of two 'CUevent' pointers
class Event {
 public:
  // Note that there is no constructor based on the regular CUDA data-type because of extra state

  // Regular constructor with memory management
  explicit Event():
      start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }),
      end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) {
    CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT));
    CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT));
  }

  // Waits for completion of this event (not implemented for CUDA)
  void WaitForCompletion() const { } // not needed due to cuStreamSynchronize call after each kernel launch

  // Retrieves the elapsed time of the last recorded event
  float GetElapsedTime() const {
    auto result = 0.0f;
    cuEventElapsedTime(&result, *start_, *end_);
    return result;
  }

  // Accessors to the private data-members
  const CUevent& start() const { return *start_; }
  const CUevent& end() const { return *end_; }
  Event* pointer() { return this; }
 private:
  std::shared_ptr<CUevent> start_;
  std::shared_ptr<CUevent> end_;
};

// Pointer to a CUDA event
using EventPointer = Event*;

// =================================================================================================

// Raw platform ID type
using RawPlatformID = size_t;

// The CUDA platform: initializes the CUDA driver API
class Platform {
 public:
  // Initializes the platform. Note that the platform ID variable is not actually used for CUDA.
  explicit Platform(const size_t platform_id) : platform_id_(0) {
    if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); }
    CheckError(cuInit(0));
  }

  // Methods to retrieve platform information
  std::string Name() const { return "CUDA"; }
  std::string Vendor() const { return "NVIDIA Corporation"; }
  std::string Version() const {
    auto result = 0;
    CheckError(cuDriverGetVersion(&result));
    return "CUDA driver "+std::to_string(result);
  }

  // Returns the number of devices on this platform
  size_t NumDevices() const {
    auto result = 0;
    CheckError(cuDeviceGetCount(&result));
    return static_cast<size_t>(result);
  }

  // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0)
  const RawPlatformID& operator()() const { return platform_id_; }
 private:
  const size_t platform_id_;
};
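// Illustrative note (not part of the original source): because the classes and interfaces here
// mirror those in clpp11.hpp one-to-one, host code written against the OpenCL wrappers can be
// re-targeted to CUDA by switching only the included header, as advertised at the top of this file.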
inline std::vector GetAllPlatforms() { auto all_platforms = std::vector{ Platform(size_t{0}) }; return all_platforms; } // ================================================================================================= // Raw device ID type using RawDeviceID = CUdevice; // C++11 version of 'CUdevice' class Device { public: // Constructor based on the regular CUDA data-type explicit Device(const CUdevice device): device_(device) { } // Initialization explicit Device(const Platform &platform, const size_t device_id) { auto num_devices = platform.NumDevices(); if (num_devices == 0) { throw RuntimeError("Device: no devices found"); } if (device_id >= num_devices) { throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); } CheckError(cuDeviceGet(&device_, device_id)); } // Methods to retrieve device information RawPlatformID PlatformID() const { return 0; } std::string Version() const { auto result = 0; CheckError(cuDriverGetVersion(&result)); return "CUDA driver "+std::to_string(result); } size_t VersionNumber() const { auto result = 0; CheckError(cuDriverGetVersion(&result)); return static_cast(result); } std::string Vendor() const { return "NVIDIA Corporation"; } std::string Name() const { auto result = std::string{}; result.resize(kStringLength); CheckError(cuDeviceGetName(&result[0], result.size(), device_)); result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters return result; } std::string Type() const { return "GPU"; } size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); } size_t MaxWorkItemDimensions() const { return size_t{3}; } std::vector MaxWorkItemSizes() const { return std::vector{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)}; } unsigned long LocalMemSize() const { return static_cast(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); } std::string Capabilities() const { const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); return "SM"+std::to_string(major)+"."+std::to_string(minor); } std::string ComputeArch() const { const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); return "compute_"+std::to_string(major)+std::to_string(minor); } bool HasExtension(const std::string &extension) const { return false; } bool SupportsFP64() const { return true; } bool SupportsFP16() const { const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); if (major > 5) { return true; } // SM 6.x, 7.x and higher if (major == 5 && minor == 3) { return true; } // SM 5.3 return false; } size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); } size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); } unsigned long MemorySize() const { auto result = size_t{0}; CheckError(cuDeviceTotalMem(&result, device_)); return static_cast(result); } unsigned long MaxAllocSize() const { return MemorySize(); } size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); } size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); } // Configuration-validity checks bool IsLocalMemoryValid(const size_t local_mem_usage) const { return (local_mem_usage <= 
LocalMemSize()); } bool IsThreadConfigValid(const std::vector &local) const { auto local_size = size_t{1}; for (const auto &item: local) { local_size *= item; } for (auto i=size_t{0}; i MaxWorkItemSizes()[i]) { return false; } } if (local_size > MaxWorkGroupSize()) { return false; } if (local.size() > MaxWorkItemDimensions()) { return false; } return true; } // Query for a specific type of device or brand bool IsCPU() const { return false; } bool IsGPU() const { return true; } bool IsAMD() const { return false; } bool IsNVIDIA() const { return true; } bool IsIntel() const { return false; } bool IsARM() const { return false; } bool IsQualcomm() const { return false; } // Platform specific extensions std::string AMDBoardName() const { return ""; } std::string NVIDIAComputeCapability() const { return Capabilities(); } // Returns if the Nvidia chip is a Volta or later archicture (major version 7 or higher) bool IsPostNVIDIAVolta() const { return GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 7; } // Retrieves the above extra information std::string GetExtraInfo() const { return NVIDIAComputeCapability(); } // Accessor to the private data-member const RawDeviceID& operator()() const { return device_; } private: CUdevice device_; // Private helper function size_t GetInfo(const CUdevice_attribute info) const { auto result = 0; CheckError(cuDeviceGetAttribute(&result, info, device_)); return static_cast(result); } }; // ================================================================================================= // Raw context type using RawContext = CUcontext; // C++11 version of 'CUcontext' class Context { public: // Constructor based on the regular CUDA data-type: memory management is handled elsewhere explicit Context(const CUcontext context): context_(new CUcontext) { *context_ = context; } // Regular constructor with memory management explicit Context(const Device &device): context_(new CUcontext, [](CUcontext* c) { if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); } delete c; }) { CheckError(cuCtxCreate(context_.get(), 0, device())); } // Accessor to the private data-member const RawContext& operator()() const { return *context_; } RawContext* pointer() const { return &(*context_); } private: std::shared_ptr context_; }; // Pointer to a raw CUDA context using ContextPointer = CUcontext*; // ================================================================================================= // C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. 
class Program { public: Program() = default; // Note that there is no constructor based on the regular CUDA data-type because of extra state // Source-based constructor with memory management explicit Program(const Context &, std::string source): program_(new nvrtcProgram, [](nvrtcProgram* p) { if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); } delete p; }), source_(std::move(source)), from_binary_(false) { const auto source_ptr = &source_[0]; CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr)); } // PTX-based constructor explicit Program(const Device &device, const Context &context, const std::string &binary): program_(nullptr), // not used source_(binary), from_binary_(true) { } // Compiles the device program and checks whether or not there are any warnings/errors void Build(const Device &device, std::vector &options) { options.push_back("-arch=" + device.ComputeArch()); if (from_binary_) { return; } auto raw_options = std::vector(); for (const auto &option: options) { raw_options.push_back(option.c_str()); } auto status = nvrtcCompileProgram(*program_, raw_options.size(), raw_options.data()); CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram"); CheckError(cuModuleLoadDataEx(&module_, GetIR().data(), 0, nullptr, nullptr)); } // Confirms whether a certain status code is an actual compilation error or warning bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { return (status == NVRTC_ERROR_COMPILATION); } // Retrieves the warning/error message from the compiler (if any) std::string GetBuildInfo(const Device &) const { if (from_binary_) { return std::string{}; } auto bytes = size_t{0}; CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes)); auto result = std::string{}; result.resize(bytes); CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0])); return result; } // Retrieves an intermediate representation of the compiled program (i.e. 
PTX) std::string GetIR() const { if (from_binary_) { return source_; } // holds the PTX auto bytes = size_t{0}; CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes)); auto result = std::string{}; result.resize(bytes); CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0])); return result; } // Accessor to the private data-members const CUmodule GetModule() const { return module_; } const nvrtcProgram& operator()() const { return *program_; } private: std::shared_ptr program_; CUmodule module_; std::string source_; bool from_binary_; }; // ================================================================================================= // Raw command-queue type using RawCommandQueue = CUstream; // C++11 version of 'CUstream' class Queue { public: // Note that there is no constructor based on the regular CUDA data-type because of extra state // Regular constructor with memory management explicit Queue(const Context &context, const Device &device): queue_(new CUstream, [](CUstream* s) { if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); } delete s; }), context_(context), device_(device) { CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING)); } // Synchronizes the queue and optionally also an event void Finish(Event &event) const { CheckError(cuEventSynchronize(event.end())); Finish(); } void Finish() const { CheckError(cuStreamSynchronize(*queue_)); } // Retrieves the corresponding context or device Context GetContext() const { return context_; } Device GetDevice() const { return device_; } // Accessor to the private data-member const RawCommandQueue& operator()() const { return *queue_; } private: std::shared_ptr queue_; const Context context_; const Device device_; }; // ================================================================================================= // C++11 version of page-locked host memory template class BufferHost { public: // Regular constructor with memory management explicit BufferHost(const Context &, const size_t size): buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }), size_(size) { CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T))); } // Retrieves the actual allocated size in bytes size_t GetSize() const { return size_*sizeof(T); } // Compatibility with std::vector size_t size() const { return size_; } T* begin() { return &static_cast(*buffer_)[0]; } T* end() { return &static_cast(*buffer_)[size_-1]; } T& operator[](const size_t i) { return static_cast(*buffer_)[i]; } T* data() { return static_cast(*buffer_); } const T* data() const { return static_cast(*buffer_); } private: std::shared_ptr buffer_; const size_t size_; }; // ================================================================================================= // Enumeration of buffer access types enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; // C++11 version of 'CUdeviceptr' template class Buffer { public: // Constructor based on the regular CUDA data-type: memory management is handled elsewhere explicit Buffer(const CUdeviceptr buffer): buffer_(new CUdeviceptr), access_(BufferAccess::kNotOwned) { *buffer_ = buffer; } // Regular constructor with memory management. If this class does not own the buffer object, then // the memory will not be freed automatically afterwards. 
explicit Buffer(const Context &, const BufferAccess access, const size_t size): buffer_(new CUdeviceptr, [access, size](CUdeviceptr* m) { if (access != BufferAccess::kNotOwned && size > 0) { CheckError(cuMemFree(*m)); } delete m; }), access_(access) { if (size > 0) { CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); } } // As above, but now with read/write access as a default explicit Buffer(const Context &context, const size_t size): Buffer(context, BufferAccess::kReadWrite, size) { } // Constructs a new buffer based on an existing host-container template explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): Buffer(context, BufferAccess::kReadWrite, static_cast(end - start)) { auto size = static_cast(end - start); auto pointer = &*start; CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue())); queue.Finish(); } // Copies from device to host: reading the device buffer a-synchronously void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { if (access_ == BufferAccess::kWriteOnly) { throw LogicError("Buffer: reading from a write-only buffer"); } CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue())); } void ReadAsync(const Queue &queue, const size_t size, std::vector &host, const size_t offset = 0) const { if (host.size() < size) { throw LogicError("Buffer: target host buffer is too small"); } ReadAsync(queue, size, host.data(), offset); } void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, const size_t offset = 0) const { if (host.size() < size) { throw LogicError("Buffer: target host buffer is too small"); } ReadAsync(queue, size, host.data(), offset); } // Copies from device to host: reading the device buffer void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { ReadAsync(queue, size, host, offset); queue.Finish(); } void Read(const Queue &queue, const size_t size, std::vector &host, const size_t offset = 0) const { Read(queue, size, host.data(), offset); } void Read(const Queue &queue, const size_t size, BufferHost &host, const size_t offset = 0) const { Read(queue, size, host.data(), offset); } // Copies from host to device: writing the device buffer a-synchronously void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { if (access_ == BufferAccess::kReadOnly) { throw LogicError("Buffer: writing to a read-only buffer"); } if (GetSize() < (offset+size)*sizeof(T)) { throw LogicError("Buffer: target device buffer is too small"); } CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue())); } void WriteAsync(const Queue &queue, const size_t size, const std::vector &host, const size_t offset = 0) { WriteAsync(queue, size, host.data(), offset); } void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host, const size_t offset = 0) { WriteAsync(queue, size, host.data(), offset); } // Copies from host to device: writing the device buffer void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { WriteAsync(queue, size, host, offset); queue.Finish(); } void Write(const Queue &queue, const size_t size, const std::vector &host, const size_t offset = 0) { Write(queue, size, host.data(), offset); } void Write(const Queue &queue, const size_t size, const BufferHost &host, const size_t offset = 0) { Write(queue, size, host.data(), offset); } // Copies the contents of this buffer into another 
device buffer void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination, EventPointer event = nullptr) const { CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue())); } void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { CopyToAsync(queue, size, destination); queue.Finish(); } // Retrieves the actual allocated size in bytes size_t GetSize() const { auto result = size_t{0}; CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_)); return result; } // Accessors to the private data-members CUdeviceptr operator()() const { return *buffer_; } CUdeviceptr& operator()() { return *buffer_; } private: std::shared_ptr buffer_; BufferAccess access_; }; // ================================================================================================= // C++11 version of 'CUfunction' class Kernel { public: // Constructor based on the regular CUDA data-type: memory management is handled elsewhere explicit Kernel(const CUfunction kernel): name_("unknown"), kernel_(kernel) { } // Regular constructor with memory management explicit Kernel(const std::shared_ptr program, const std::string &name): name_(name) { CheckError(cuModuleGetFunction(&kernel_, program->GetModule(), name.c_str())); } // Sets a kernel argument at the indicated position. This stores both the value of the argument // (as raw bytes) and the index indicating where this value can be found. template void SetArgument(const size_t index, const T &value) { if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); } arguments_indices_[index] = arguments_data_.size(); for (auto j=size_t(0); j(&value)[j]); } } template void SetArgument(const size_t index, Buffer &value) { SetArgument(index, value()); } // Sets all arguments in one go using parameter packs. Note that this resets all previously set // arguments using 'SetArgument' or 'SetArguments'. template void SetArguments(Args&... args) { arguments_indices_.clear(); arguments_data_.clear(); SetArgumentsRecursive(0, args...); } // Retrieves the amount of local memory used per work-group for this kernel. Note that this the // shared memory in CUDA terminology. 
unsigned long LocalMemUsage(const Device &) const { auto result = 0; CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_)); return static_cast(result); } // Retrieves the name of the kernel std::string GetFunctionName() const { return name_; } // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event) { // TODO: Currently this CUDA launch is always synchronous due to a cuStreamSynchronize call if (local.size() == 0) { throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); } // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) auto grid = std::vector{1, 1, 1}; auto block = std::vector{1, 1, 1}; if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); } for (auto i=size_t{0}; i pointers; for (auto &index: arguments_indices_) { pointers.push_back(&arguments_data_[index]); } // Launches the kernel, its execution time is recorded by events if (event) { CheckError(cuEventRecord(event->start(), queue())); } CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, queue(), pointers.data(), nullptr)); cuStreamSynchronize(queue()); if (event) { CheckError(cuEventRecord(event->end(), queue())); } } // As above, but with an event waiting list void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event, const std::vector& waitForEvents) { for (auto &waitEvent : waitForEvents) { waitEvent.WaitForCompletion(); // note: doesn't do anything, every kernel call is synchronous } return Launch(queue, global, local, event); } // Accessors to the private data-members const CUfunction& operator()() const { return kernel_; } CUfunction operator()() { return kernel_; } private: const std::string name_; CUfunction kernel_; std::vector arguments_indices_; // Indices of the arguments std::vector arguments_data_; // The arguments data as raw bytes // Internal implementation for the recursive SetArguments function. template void SetArgumentsRecursive(const size_t index, T &first) { SetArgument(index, first); } template void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { SetArgument(index, first); SetArgumentsRecursive(index+1, args...); } }; // ================================================================================================= } // namespace clblast // CLBLAST_CUPP11_H_ #endif CLBlast-1.6.3/src/cxpp11_common.hpp000066400000000000000000000067561463263031500170120ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Ivan Shapovalov // // This file contains exception classes corresponding to 'clpp11.hpp'. It is also part of the // CLCudaAPI project. See 'clpp11.hpp' for more details. 
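//
// As an illustrative sketch (not part of the shipped sources), the "one-line change" promise of
// the common CLCudaAPI interface in cupp11.hpp above can be made concrete as follows, where
// 'axpy_source' is a hypothetical kernel source string with an entry point named "axpy":
//
//   auto platform = Platform(size_t{0});
//   auto device   = Device(platform, size_t{0});
//   auto context  = Context(device);
//   auto queue    = Queue(context, device);
//   auto program  = std::make_shared<Program>(context, axpy_source);
//   auto options  = std::vector<std::string>();
//   program->Build(device, options);       // NVRTC here; the OpenCL compiler in clpp11.hpp
//   auto buffer   = Buffer<float>(context, size_t{1024});
//   auto kernel   = Kernel(program, "axpy");
//
// Swapping the include of cupp11.hpp for clpp11.hpp retargets the same code to OpenCL.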
//
// =================================================================================================

#ifndef CLBLAST_CXPP11_COMMON_H_
#define CLBLAST_CXPP11_COMMON_H_

#include <cstring>   // strchr
#include <string>    // std::string
#include <stdexcept> // std::runtime_error

namespace clblast {
// =================================================================================================

// Basic exception class: represents an error happened inside our code
// (as opposed to an error in C++ runtime)
template <typename Base>
class Error : public Base {
 public:
  // Perfect forwarding of the constructor since "using Base::Base" is not supported by VS 2013
  template <typename... Args>
  Error(Args&&... args):
      Base(std::forward<Args>(args)...) {
  }
};

// =================================================================================================

// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
class DeviceError : public Error<std::runtime_error> {
 public:
  // Perfect forwarding of the constructor since "using Error<std::runtime_error>::Error" is not
  // supported by VS 2013
  template <typename... Args>
  DeviceError(Args&&... args):
      Error<std::runtime_error>(std::forward<Args>(args)...) {
  }

  static std::string TrimCallString(const char *where) {
    const char *paren = strchr(where, '(');
    if (paren) {
      return std::string(where, paren);
    } else {
      return std::string(where);
    }
  }
};

// =================================================================================================

// Represents a generic runtime error (aka environmental problem)
class RuntimeError : public Error<std::runtime_error> {
 public:
  explicit RuntimeError(const std::string &reason):
      Error("Run-time error: " + reason) {
  }
};

// =================================================================================================

// Represents a generic logic error (aka failed assertion)
class LogicError : public Error<std::logic_error> {
 public:
  explicit LogicError(const std::string &reason):
      Error("Internal logic error: " + reason) {
  }
};

// =================================================================================================

// Internal exception base class with a status field and a subclass-specific "details" field
// which can be used to recreate an exception
template <typename Base, typename Status>
class ErrorCode : public Base {
 public:
  ErrorCode(Status status, const std::string &details, const std::string &reason):
      Base(reason),
      status_(status),
      details_(details) {
  }

  Status status() const { return status_; }
  const std::string& details() const { return details_; }

 private:
  const Status status_;
  const std::string details_;
};

// =================================================================================================
} // namespace clblast

// CLBLAST_CXPP11_COMMON_H_
#endif

CLBlast-1.6.3/src/database/apple_cpu_fallback.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file provides overrides for Apple's OpenCL CPU implementation. It is a special case compared
// to all other implementations, as it only supports a 1-dimensional work-group size. In addition,
// that work-group size is limited to 1024 (in theory) or much lower (kernel resource dependent).
// Thus, instead of supporting this corner-case in the whole regular flow (starting from the tuner), // we provide this file with some manual overrides. // // Note: These overrides are to make the Apple CPU work and not crash, they are not in any way // optimized parameters. For decent speed don't use Apple's OpenCL CPU implementation. // // ================================================================================================= namespace clblast { namespace database { // ================================================================================================= const DatabaseEntry XaxpyApple = { "Xaxpy", Precision::kAny, {"VW", "WGS", "WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry XdotApple = { "Xdot", Precision::kAny, {"WGS1", "WGS2"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry XgemvApple = { "Xgemv", Precision::kAny, {"WGS1", "WPT1", "UNROLL1"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry XgemvFastApple = { "XgemvFast", Precision::kAny, {"VW2", "WGS2", "WPT2"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry XgemvFastRotApple = { "XgemvFastRot", Precision::kAny, {"VW3", "WGS3", "WPT3"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry XgerApple = { "Xger", Precision::kAny, {"WGS1", "WGS2", "WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry XtrsvApple = { "Xtrsv", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry XgemmApple = { "Xgemm", Precision::kAny, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1 } } } } } } } }; const DatabaseEntry XgemmDirectApple = { "XgemmDirect", Precision::kAny, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry XconvgemmApple = { "Xconvgemm", Precision::kAny, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry CopyApple = { "Copy", Precision::kAny, {"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry PadApple = { "Pad", Precision::kAny, {"PAD_DIMX", "PAD_DIMY", "PAD_WPTX", "PAD_WPTY"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, 
Params{ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry TransposeApple = { "Transpose", Precision::kAny, {"TRA_DIM", "TRA_PAD", "TRA_SHUFFLE", "TRA_WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry PadtransposeApple = { "Padtranspose", Precision::kAny, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry InvertApple = { "Invert", Precision::kAny, {"INTERNAL_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; const DatabaseEntry TrsvRoutineApple = { "TrsvRoutine", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } } }; // ================================================================================================= } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/database.cpp000066400000000000000000000311351463263031500176160ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Database class (see the header for information about the class). 
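//
// Illustrative only (not part of the original file): once a Database instance has resolved its
// parameters, GetDefines() below concatenates each (name, value) pair into one OpenCL
// pre-processor define. For example, resolved parameters { COPY_DIMX=32, COPY_DIMY=8, COPY_VW=4,
// COPY_WPT=1 } would produce the string:
//
//   #define COPY_DIMX 32
//   #define COPY_DIMY 8
//   #define COPY_VW 4
//   #define COPY_WPT 1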
// // ================================================================================================= #include #include "utilities/utilities.hpp" #include "database/database.hpp" #include "database/kernels/xaxpy/xaxpy.hpp" #include "database/kernels/xdot/xdot.hpp" #include "database/kernels/xgemv/xgemv.hpp" #include "database/kernels/xgemv_fast/xgemv_fast.hpp" #include "database/kernels/xgemv_fast_rot/xgemv_fast_rot.hpp" #include "database/kernels/xger/xger.hpp" #include "database/kernels/xgemm/xgemm.hpp" #include "database/kernels/xgemm_direct/xgemm_direct.hpp" #include "database/kernels/xconvgemm/xconvgemm.hpp" #include "database/kernels/copy/copy.hpp" #include "database/kernels/pad/pad.hpp" #include "database/kernels/transpose/transpose.hpp" #include "database/kernels/padtranspose/padtranspose.hpp" #include "database/kernels/invert/invert.hpp" #include "database/kernels/gemm_routine/gemm_routine.hpp" #include "database/kernels/trsv_routine/trsv_routine.hpp" #include "database/apple_cpu_fallback.hpp" namespace clblast { // ================================================================================================= std::vector Database::database = std::vector{}; const std::vector Database::apple_cpu_fallback = std::vector{ database::XaxpyApple, database::XdotApple, database::XgemvApple, database::XgemvFastApple, database::XgemvFastRotApple, database::XgerApple, database::XtrsvApple, database::XgemmApple, database::XgemmDirectApple, database::XconvgemmApple, database::CopyApple, database::PadApple, database::TransposeApple, database::PadtransposeApple, database::InvertApple, database::TrsvRoutineApple }; // The default values const std::string Database::kDeviceVendorAll = "default"; // ================================================================================================= // Constructor, computing device properties and populating the parameter-vector from the database. // This takes an optional overlay database in case of custom tuning or custom kernels. Database::Database(const Device &device, const std::string &kernel_name, const Precision precision, const std::vector &overlay): parameters_(std::make_shared()) { // Initializes the static variable on first use. 
At this point we are sure all global variables are initialized if (database.size() == 0) { database = std::vector{ database::XaxpyHalf, database::XaxpySingle, database::XaxpyDouble, database::XaxpyComplexSingle, database::XaxpyComplexDouble, database::XdotHalf, database::XdotSingle, database::XdotDouble, database::XdotComplexSingle, database::XdotComplexDouble, database::XgemvHalf, database::XgemvSingle, database::XgemvDouble, database::XgemvComplexSingle, database::XgemvComplexDouble, database::XgemvFastHalf, database::XgemvFastSingle, database::XgemvFastDouble, database::XgemvFastComplexSingle, database::XgemvFastComplexDouble, database::XgemvFastRotHalf, database::XgemvFastRotSingle, database::XgemvFastRotDouble, database::XgemvFastRotComplexSingle, database::XgemvFastRotComplexDouble, database::XgerHalf, database::XgerSingle, database::XgerDouble, database::XgerComplexSingle, database::XgerComplexDouble, database::XgemmHalf, database::XgemmSingle, database::XgemmDouble, database::XgemmComplexSingle, database::XgemmComplexDouble, database::XgemmDirectHalf, database::XgemmDirectSingle, database::XgemmDirectDouble, database::XgemmDirectComplexSingle, database::XgemmDirectComplexDouble, database::XconvgemmHalf, database::XconvgemmSingle, database::XconvgemmDouble, database::XconvgemmComplexSingle, database::XconvgemmComplexDouble, database::CopyHalf, database::CopySingle, database::CopyDouble, database::CopyComplexSingle, database::CopyComplexDouble, database::PadHalf, database::PadSingle, database::PadDouble, database::PadComplexSingle, database::PadComplexDouble, database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble, database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, database::PadtransposeComplexSingle, database::PadtransposeComplexDouble, database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble, database::GemmRoutineHalf, database::GemmRoutineSingle, database::GemmRoutineDouble, database::GemmRoutineComplexSingle, database::GemmRoutineComplexDouble, database::TrsvRoutineHalf, database::TrsvRoutineSingle, database::TrsvRoutineDouble, database::TrsvRoutineComplexSingle, database::TrsvRoutineComplexDouble }; } // Finds device information const auto device_type = GetDeviceType(device); const auto device_vendor = GetDeviceVendor(device); const auto device_architecture = GetDeviceArchitecture(device); const auto device_name = GetDeviceName(device); // Prints the obtained information in verbose mode log_debug("Device type '" + device_type + "'; vendor '" + device_vendor + "'"); log_debug("Device name '" + device_name + "'; architecture '" + device_architecture + "'"); // Sets the databases to search through auto databases = std::list>{overlay, database}; // Special case: modifies the database if the device is a CPU with Apple OpenCL #if defined(__APPLE__) || defined(__MACOSX) if (device.Type() == "CPU") { const auto extensions = device.Capabilities(); const auto is_apple = (extensions.find("cl_APPLE_SetMemObjectDestructor") == std::string::npos) ? 
false : true; const auto is_likely_apple = device.MaxWorkGroupSize() <= 32; if (is_apple || is_likely_apple) { databases.push_front(apple_cpu_fallback); } } #endif // Searches potentially multiple databases auto search_result = database::Parameters(); for (auto &db: databases) { search_result = Search(kernel_name, device_vendor, device_type, device_name, device_architecture, precision, db); if (search_result.size() != 0) { parameters_->insert(search_result.begin(), search_result.end()); break; } } if (search_result.size() == 0) { throw RuntimeErrorCode(StatusCode::kDatabaseError); } } // ================================================================================================= // Returns a list of OpenCL pre-processor defines in string form std::string Database::GetDefines() const { std::string defines{}; for (auto ¶meter: *parameters_) { defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n"; } return defines; } // ... or just the values as string std::string Database::GetValuesString() const { std::string defines{}; for (auto ¶meter: *parameters_) { defines += "_"+ToString(parameter.second); } return defines; } // Retrieves the names of all the parameters std::vector Database::GetParameterNames() const { auto parameter_names = std::vector(); for (auto ¶meter: *parameters_) { parameter_names.push_back(parameter.first); } return parameter_names; } // ================================================================================================= // Searches a particular database for the right kernel and precision database::Parameters Database::Search(const std::string &this_kernel, const std::string &this_vendor, const std::string &this_type, const std::string &this_device, const std::string &this_architecture, const Precision this_precision, const std::vector &this_database) const { // Selects the right kernel for (auto &db: this_database) { if ((db.kernel == this_kernel) && (db.precision == this_precision || db.precision == Precision::kAny)) { // Searches for the right vendor and device type, or selects the default if unavailable const auto parameters = SearchVendorAndType(this_vendor, this_type, this_device, this_architecture, db.vendors, db.parameter_names); if (parameters.size() != 0) { return parameters; } return SearchVendorAndType(kDeviceVendorAll, database::kDeviceTypeAll, this_device, this_architecture, db.vendors, db.parameter_names); } } // If we reached this point, the entry was not found in this database return database::Parameters(); } database::Parameters Database::SearchVendorAndType(const std::string &target_vendor, const std::string &target_type, const std::string &this_device, const std::string &this_architecture, const std::vector &vendors, const std::vector ¶meter_names) const { for (auto &vendor: vendors) { if ((vendor.name == target_vendor) && (vendor.type == target_type)) { log_debug("Found architectures of vendor '" + target_vendor + "' and type '" + target_type + "'"); // Searches the architecture; if unavailable returns the vendor's default parameters auto parameters = SearchArchitecture(this_architecture, this_device, vendor.architectures, parameter_names); if (parameters.size() != 0) { return parameters; } return SearchArchitecture("default", this_device, vendor.architectures, parameter_names); } } return database::Parameters(); } database::Parameters Database::SearchArchitecture(const std::string &target_architecture, const std::string &this_device, const std::vector &architectures, const std::vector ¶meter_names) const { for (auto 
&architecture: architectures) { if (architecture.name == target_architecture) { log_debug("Found devices of architecture type '" + target_architecture + "'"); // Searches the device; if unavailable returns the architecture's default parameters auto parameters = SearchDevice(this_device, architecture.devices, parameter_names); if (parameters.size() != 0) { return parameters; } return SearchDevice("default", architecture.devices, parameter_names); } } return database::Parameters(); } database::Parameters Database::SearchDevice(const std::string &target_device, const std::vector &devices, const std::vector ¶meter_names) const { for (auto &device: devices) { const auto device_name = CharArrayToString(device.name); // Cuts off 'target_device' string at 50 since the database cuts off as well const auto target_device_cut_off = (target_device.length() > 50) ? target_device.substr(0, 50) : target_device; if (device_name == target_device_cut_off) { log_debug("Found parameters for device type '" + target_device_cut_off + "'"); // Sets the parameters accordingly auto parameters = database::Parameters(); if (parameter_names.size() > device.parameters.size()) { return database::Parameters(); } // ERROR for (auto i = size_t{0}; i < parameter_names.size(); ++i) { parameters[parameter_names[i]] = static_cast(device.parameters[i]); } return parameters; } } return database::Parameters(); } // Helper to convert from database format to proper types std::string Database::CharArrayToString(const database::Name char_array) const { auto result = std::string(char_array.data()); result.erase(result.find_last_not_of(" \t\n\r\f\v") + 1); return result; } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/database/database.hpp000066400000000000000000000122721463263031500176240ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Database class, providing a static variable holding the actual database // information. The class also provides utility functions to search the database and to access a // found entry by parameter-key. The database itself is filled in the corresponding source-file and // partially also by the database/xxxxx.h files, in which kernel-specific parameters are found. 
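//
// Illustrative only (not part of the original header): a typical lookup flow, assuming 'device'
// is an already-initialized Device wrapper and no user overlay entries are supplied:
//
//   const auto db = Database(device, "Copy", Precision::kSingle, {});
//   const auto dim_x   = db["COPY_DIMX"];  // a single tuning parameter by key
//   const auto defines = db.GetDefines();  // all parameters as "#define ..." lines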
// // ================================================================================================= #ifndef CLBLAST_DATABASE_H_ #define CLBLAST_DATABASE_H_ #include #include #include #include "utilities/utilities.hpp" #include "database/database_structure.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class class Database { public: // The OpenCL device vendors static const std::string kDeviceVendorAll; // The database consists of separate database entries, stored together in a vector static std::vector database; // Database for a special case: Apple CPUs support limited number of threads static const std::vector apple_cpu_fallback; Database() = default; // The constructor with a user-provided database overlay (potentially an empty vector) explicit Database(const Device &device, const std::string &kernel_name, const Precision precision, const std::vector &overlay); // Accessor of values by key size_t operator[](const std::string &key) const { return parameters_->find(key)->second; } bool exists(const std::string &key) const { return (parameters_->count(key) == 1); } // Obtain a list of OpenCL pre-processor defines based on the parameters std::string GetDefines() const; // Retrieves the values or names of all the parameters std::string GetValuesString() const; std::vector GetParameterNames() const; const database::Parameters& GetParameters() const { return *parameters_; } private: // Search method functions, returning a set of parameters (possibly empty) database::Parameters Search(const std::string &this_kernel, const std::string &this_vendor, const std::string &this_type, const std::string &this_device, const std::string &this_architecture, const Precision this_precision, const std::vector &db) const; database::Parameters SearchDevice(const std::string &target_device, const std::vector &devices, const std::vector ¶meter_names) const; database::Parameters SearchArchitecture(const std::string &target_architecture, const std::string &this_device, const std::vector &architectures, const std::vector ¶meter_names) const; database::Parameters SearchVendorAndType(const std::string &target_vendor, const std::string &target_type, const std::string &this_device, const std::string &this_architecture, const std::vector &vendors, const std::vector ¶meter_names) const; // Helper to convert from database format to proper types std::string CharArrayToString(const database::Name char_array) const; // Found parameters suitable for this device/kernel std::shared_ptr parameters_; }; // ================================================================================================= // Multiple databases together in a map class Databases { public: explicit Databases(const std::vector &kernel_names): kernel_names_(kernel_names) { } // Database accessor Database& operator()(const std::string &kernel_name) { return databases_[kernel_name]; } // Retrieves a parameter from the database size_t operator[](const std::string &key) const { for (const auto &kernel_name : kernel_names_) { const auto &kernel_db = databases_.find(kernel_name)->second; if (kernel_db.exists(key)) { return kernel_db[key]; } } throw RuntimeErrorCode(StatusCode::kDatabaseError); } private: const std::vector kernel_names_; std::unordered_map databases_; }; // ================================================================================================= } // namespace clblast // CLBLAST_DATABASE_H_ #endif 
CLBlast-1.6.3/src/database/database_structure.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file describes the database storage structures.
//
// =================================================================================================

#ifndef CLBLAST_DATABASE_DATABASE_STRUCTURE_H_
#define CLBLAST_DATABASE_DATABASE_STRUCTURE_H_

#include <array>
#include <map>
#include <string>
#include <vector>

// Just needed for 'Precision'
#ifdef OPENCL_API
  #define CL_TARGET_OPENCL_VERSION 110
  #include "clblast.h"
#elif CUDA_API
  #include "clblast_cuda.h"
#endif

namespace clblast {
// A special namespace to hold all the global constant variables (including the database entries)
namespace database {

// =================================================================================================

// Type alias for the database storage (arrays for fast compilation/efficiency)
using Name = std::array<char, 51>;     // name as stored in database (50 chars + string terminator)
using Params = std::array<size_t, 16>; // parameters as stored in database

// Type alias after extracting from the database (sorted map for improved code readability)
using Parameters = std::map<std::string, size_t>; // parameters after reading from DB

// The OpenCL device types
const std::string kDeviceTypeCPU = "CPU";
const std::string kDeviceTypeGPU = "GPU";
const std::string kDeviceTypeAccelerator = "accelerator";
const std::string kDeviceTypeAll = "default";
const Name kDeviceNameDefault = {"default                                           "};

struct DatabaseDevice {
  Name name;
  Params parameters; // parameter values
};
struct DatabaseArchitecture {
  std::string name;
  std::vector<DatabaseDevice> devices;
};
struct DatabaseVendor {
  std::string type;
  std::string name;
  std::vector<DatabaseArchitecture> architectures;
};
struct DatabaseEntry {
  std::string kernel;
  Precision precision;
  std::vector<std::string> parameter_names;
  std::vector<DatabaseVendor> vendors;
};

// =================================================================================================
} // namespace database
} // namespace clblast

// CLBLAST_DATABASE_DATABASE_STRUCTURE_H_
#endif

CLBlast-1.6.3/src/database/kernels/copy/copy.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Copy' kernels.
//
// =================================================================================================

#include "database/kernels/copy/copy.hpp"
#include "database/kernels/copy/copy_16.hpp"
#include "database/kernels/copy/copy_32.hpp"
#include "database/kernels/copy/copy_3232.hpp"
#include "database/kernels/copy/copy_64.hpp"
#include "database/kernels/copy/copy_6464.hpp"

CLBlast-1.6.3/src/database/kernels/copy/copy.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Copy' kernels.
//
// =================================================================================================

#include "database/database_structure.hpp"

namespace clblast {
namespace database {

extern const DatabaseEntry CopyHalf;
extern const DatabaseEntry CopySingle;
extern const DatabaseEntry CopyComplexSingle;
extern const DatabaseEntry CopyDouble;
extern const DatabaseEntry CopyComplexDouble;

} // namespace database
} // namespace clblast

CLBlast-1.6.3/src/database/kernels/copy/copy_16.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Copy16' kernels.
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry CopyHalf = { "Copy", Precision::kHalf, {"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 8, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 8, 16, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 32, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 16, 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 8, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 16, 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 16, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 32, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 8, 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 16, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 16, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/copy/copy_32.hpp000066400000000000000000000653711463263031500217630ustar00rootroot00000000000000 // 
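//
// Illustrative only (not part of the generated files): entries in this format can also be
// hand-written and passed to the Database constructor as an overlay, e.g. to force custom 'Copy'
// parameters on every device. Note the fixed-width Params array padded with zeros:
//
//   const DatabaseEntry CopyOverlayExample = {
//     "Copy", Precision::kSingle, {"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"}, {
//       { kDeviceTypeAll, "default", {
//         { "default", {
//           { kDeviceNameDefault, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
//         } },
//       } },
//     }
//   };
//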
================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Copy32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry CopySingle = { "Copy", Precision::kSingle, {"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, 
Params{ 8, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 8, 16, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Gen9 HD Graphics NEO "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 8, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 32, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GT 650M "}, Params{ 16, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 8, 32, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 8, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X 
(Pascal) "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, 
Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/copy/copy_3232.hpp000066400000000000000000000625551463263031500221310ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Copy3232' kernels. 
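// Note on the layout: each Params list below pairs positionally with the
// parameter names declared in the DatabaseEntry, so for 'CopyComplexSingle' a
// value list starting with { 16, 16, 1, 4, ... } means COPY_DIMX=16,
// COPY_DIMY=16, COPY_VW=1 and COPY_WPT=4; the trailing zeros merely pad the
// fixed-width array of 16 value slots. A minimal sketch of that pairing,
// assuming a plain names/values layout (identifiers here are illustrative and
// not the library's own API):
//
//   #include <cstddef>
//   #include <map>
//   #include <string>
//   #include <vector>
//
//   // Illustrative helper, not part of CLBlast: pairs the i-th parameter name
//   // with the i-th tuned value; padding slots beyond names.size() are ignored.
//   std::map<std::string, std::size_t> PairParams(
//       const std::vector<std::string> &names,
//       const std::vector<std::size_t> &values) {
//     std::map<std::string, std::size_t> result;
//     for (std::size_t i = 0; i < names.size(); ++i) { result[names[i]] = values[i]; }
//     return result;
//   }
//
//   // Example: PairParams({"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"},
//   //                     {16, 16, 1, 4}) yields COPY_DIMX=16 ... COPY_WPT=4,
//   // matching the 'AMD Radeon RX 480' entry below.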
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry CopyComplexSingle = { "Copy", Precision::kComplexSingle, {"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 16, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 
CPU @ 3.20GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 16, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 32, 16, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 32, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, 
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 32, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 32, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 32, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { 
// Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/copy/copy_64.hpp000066400000000000000000000534101463263031500217570ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Copy64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry CopyDouble = { "Copy", Precision::kDouble, {"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 32, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 8, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { kDeviceNameDefault , Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 16, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 16, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 32, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 16, 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 32, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 16, 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 16, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 32, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { 
Name{"GeForce GTX 970 "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 32, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 32, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 32, 32, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 8, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/copy/copy_6464.hpp000066400000000000000000000534031463263031500221330ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Copy6464' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry CopyComplexDouble = { "Copy", Precision::kComplexDouble, {"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, 
{ kDeviceNameDefault , Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 16, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 8, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) 
Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 16, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 32, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 32, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 8, 16, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 32, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 32, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 16, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 16, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA 
GeForce RTX 4050 Laptop GPU "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast

CLBlast-1.6.3/src/database/kernels/gemm_routine/

CLBlast-1.6.3/src/database/kernels/gemm_routine/gemm_routine.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Gemm_Routine' kernels.
//
// =================================================================================================

#include "database/kernels/gemm_routine/gemm_routine.hpp"
#include "database/kernels/gemm_routine/gemm_routine_16.hpp"
#include "database/kernels/gemm_routine/gemm_routine_32.hpp"
#include "database/kernels/gemm_routine/gemm_routine_3232.hpp"
#include "database/kernels/gemm_routine/gemm_routine_64.hpp"
#include "database/kernels/gemm_routine/gemm_routine_6464.hpp"

CLBlast-1.6.3/src/database/kernels/gemm_routine/gemm_routine.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Gemm_Routine' kernels.
//
// =================================================================================================

#include "database/database_structure.hpp"

namespace clblast {
namespace database {

extern const DatabaseEntry GemmRoutineHalf;
extern const DatabaseEntry GemmRoutineSingle;
extern const DatabaseEntry GemmRoutineComplexSingle;
extern const DatabaseEntry GemmRoutineDouble;
extern const DatabaseEntry GemmRoutineComplexDouble;

} // namespace database
} // namespace clblast

CLBlast-1.6.3/src/database/kernels/gemm_routine/gemm_routine_16.hpp

// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0.
It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Gemm_Routine16' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry GemmRoutineHalf = { "GemmRoutine", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T628 "}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast

CLBlast-1.6.3/src/database/kernels/gemm_routine/gemm_routine_32.hpp

// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script.
// // This file populates the database with best-found tuning parameters for the 'Gemm_Routine32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry GemmRoutineSingle = { "GemmRoutine", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics 
"}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 1152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators 
kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce GTX 750 Ti "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1792, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 
1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1920, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast

CLBlast-1.6.3/src/database/kernels/gemm_routine/gemm_routine_3232.hpp

//
================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Gemm_Routine3232' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry GemmRoutineComplexSingle = { "GemmRoutine", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 1216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { 
{ Name{"GeForce 920A "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce GTX 750 Ti "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1088, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1088, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1856, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA 
GeForce RTX 3070 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1792, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1088, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast

CLBlast-1.6.3/src/database/kernels/gemm_routine/gemm_routine_64.hpp

// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Gemm_Routine64' kernels.
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry GemmRoutineDouble = { "GemmRoutine", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1088, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 
1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce GTX 750 Ti "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA 
GeForce MX450 "}, Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1856, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1920, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast 
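// =================================================================================================
// The DatabaseEntry tables in these files only provide best-found defaults; CLBlast also exposes
// a public OverrideParameters() API so an application can substitute its own tuned values at
// runtime. A minimal sketch follows, assuming a valid cl_device_id is available: the kernel name
// "GemmRoutine" and the parameter "XGEMM_MIN_INDIRECT_SIZE" are taken from the DatabaseEntry
// definitions above, while the value 1024 is purely an illustrative assumption (e.g. a value
// found by re-running the CLBlast tuners on one's own device).
// =================================================================================================

#include <clblast.h>

#include <string>
#include <unordered_map>

clblast::StatusCode SetCustomGemmRoutineThreshold(const cl_device_id device) {
  // XGEMM_MIN_INDIRECT_SIZE is the size threshold that selects between the direct (single-kernel)
  // GEMM and the indirect (pre-transposing) GEMM; the override applies the next time the routine
  // is invoked for this device and precision.
  const std::unordered_map<std::string, size_t> parameters{
      {"XGEMM_MIN_INDIRECT_SIZE", 1024}};  // hypothetical replacement value, not from the database
  return clblast::OverrideParameters(device, "GemmRoutine", clblast::Precision::kSingle,
                                     parameters);
}
// =================================================================================================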
CLBlast-1.6.3/src/database/kernels/gemm_routine/gemm_routine_6464.hpp

// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Gemm_Routine6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry GemmRoutineComplexDouble = { "GemmRoutine", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 448, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1088, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1920, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
}, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce GTX 750 Ti "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 448, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 1152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1088, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1728, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1088, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/invert/000077500000000000000000000000001463263031500203155ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/invert/invert.cpp000066400000000000000000000014051463263031500223300ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Invert' kernels. // // ================================================================================================= #include "database/kernels/invert/invert.hpp" #include "database/kernels/invert/invert_16.hpp" #include "database/kernels/invert/invert_32.hpp" #include "database/kernels/invert/invert_3232.hpp" #include "database/kernels/invert/invert_64.hpp" #include "database/kernels/invert/invert_6464.hpp" CLBlast-1.6.3/src/database/kernels/invert/invert.hpp000066400000000000000000000014751463263031500223440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Invert' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry InvertHalf; extern const DatabaseEntry InvertSingle; extern const DatabaseEntry InvertComplexSingle; extern const DatabaseEntry InvertDouble; extern const DatabaseEntry InvertComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/invert/invert_16.hpp000066400000000000000000000217771463263031500226610ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Invert16' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry InvertHalf = { "Invert", Precision::kHalf, {"INTERNAL_BLOCK_SIZE", "LOCALPAD", "TMMWGSX", "TMMWGSY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/invert/invert_32.hpp000066400000000000000000000437471463263031500226600ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Invert32' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry InvertSingle = { "Invert", Precision::kSingle, {"INTERNAL_BLOCK_SIZE", "LOCALPAD", "TMMWGSX", "TMMWGSY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII 
"}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/invert/invert_3232.hpp000066400000000000000000000423561463263031500230200ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Invert3232' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry InvertComplexSingle = { "Invert", Precision::kComplexSingle, {"INTERNAL_BLOCK_SIZE", "LOCALPAD", "TMMWGSX", "TMMWGSY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { 
Name{"AMD Radeon VII "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
} }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/invert/invert_64.hpp000066400000000000000000000352621463263031500226560ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Invert64' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry InvertDouble = { "Invert", Precision::kDouble, {"INTERNAL_BLOCK_SIZE", "LOCALPAD", "TMMWGSX", "TMMWGSY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII 
"}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/invert/invert_6464.hpp000066400000000000000000000357551463263031500230370ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. 
// // This file populates the database with best-found tuning parameters for the 'Invert6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry InvertComplexDouble = { "Invert", Precision::kComplexDouble, {"INTERNAL_BLOCK_SIZE", "LOCALPAD", "TMMWGSX", "TMMWGSY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics 
"}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast 
CLBlast-1.6.3/src/database/kernels/pad/000077500000000000000000000000001463263031500175525ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/pad/pad.cpp000066400000000000000000000013361463263031500210250ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Pad' kernels. // // ================================================================================================= #include "database/kernels/pad/pad.hpp" #include "database/kernels/pad/pad_16.hpp" #include "database/kernels/pad/pad_32.hpp" #include "database/kernels/pad/pad_3232.hpp" #include "database/kernels/pad/pad_64.hpp" #include "database/kernels/pad/pad_6464.hpp" CLBlast-1.6.3/src/database/kernels/pad/pad.hpp000066400000000000000000000014531463263031500210320ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Pad' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry PadHalf; extern const DatabaseEntry PadSingle; extern const DatabaseEntry PadComplexSingle; extern const DatabaseEntry PadDouble; extern const DatabaseEntry PadComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/pad/pad_16.hpp000066400000000000000000000236611463263031500213450ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Pad16' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadHalf = { "Pad", Precision::kHalf, {"PAD_DIMX", "PAD_DIMY", "PAD_WPTX", "PAD_WPTY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T628 "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/pad/pad_32.hpp000066400000000000000000000652421463263031500213440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Pad32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadSingle = { "Pad", Precision::kSingle, {"PAD_DIMX", "PAD_DIMY", "PAD_WPTX", "PAD_WPTY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT 
"}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 8, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 8, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { 
Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 32, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 16, 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 
32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GT 650M "}, Params{ 32, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 32, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 16, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { 
Name{"Quadro GV100 "}, Params{ 32, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 16, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 32, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/pad/pad_3232.hpp000066400000000000000000000625121463263031500215060ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Pad3232' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadComplexSingle = { "Pad", Precision::kComplexSingle, {"PAD_DIMX", "PAD_DIMY", "PAD_WPTX", "PAD_WPTY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 16, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 8, 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) 
i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 32, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 32, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 16, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, 
{ Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default 
kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/pad/pad_64.hpp000066400000000000000000000534141463263031500213470ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Pad64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadDouble = { "Pad", Precision::kDouble, {"PAD_DIMX", "PAD_DIMY", "PAD_WPTX", "PAD_WPTY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 16, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 32, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { 
Name{"GeForce GTX 970 "}, Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 32, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 16, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/pad/pad_6464.hpp000066400000000000000000000533701463263031500215220ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Pad6464' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadComplexDouble = { "Pad", Precision::kComplexDouble, {"PAD_DIMX", "PAD_DIMY", "PAD_WPTX", "PAD_WPTY"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 16, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 16, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) 
Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce 
RTX 4050 Laptop GPU "}, Params{ 16, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/padtranspose/000077500000000000000000000000001463263031500215115ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/padtranspose/padtranspose.cpp000066400000000000000000000015231463263031500247210ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels. // // ================================================================================================= #include "database/kernels/padtranspose/padtranspose.hpp" #include "database/kernels/padtranspose/padtranspose_16.hpp" #include "database/kernels/padtranspose/padtranspose_32.hpp" #include "database/kernels/padtranspose/padtranspose_3232.hpp" #include "database/kernels/padtranspose/padtranspose_64.hpp" #include "database/kernels/padtranspose/padtranspose_6464.hpp" CLBlast-1.6.3/src/database/kernels/padtranspose/padtranspose.hpp000066400000000000000000000015411463263031500247260ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry PadtransposeHalf; extern const DatabaseEntry PadtransposeSingle; extern const DatabaseEntry PadtransposeComplexSingle; extern const DatabaseEntry PadtransposeDouble; extern const DatabaseEntry PadtransposeComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/padtranspose/padtranspose_16.hpp000066400000000000000000000243571463263031500252460ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Padtranspose16' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadtransposeHalf = { "Padtranspose", Precision::kHalf, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 0, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { 
Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T628 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } 
// namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/padtranspose/padtranspose_32.hpp000066400000000000000000000647531463263031500252500ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Padtranspose32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadtransposeSingle = { "Padtranspose", Precision::kSingle, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , 
Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X 
(Pascal) "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 
4070 Ti "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/padtranspose/padtranspose_3232.hpp000066400000000000000000000631101463263031500253770ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Padtranspose3232' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadtransposeComplexSingle = { "Padtranspose", Precision::kComplexSingle, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, 
Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 
16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 0, 8, 2, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 16, 2, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast
// [file: CLBlast-1.6.3/src/database/kernels/padtranspose/padtranspose_64.hpp]
// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Padtranspose64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadtransposeDouble = { "Padtranspose", Precision::kDouble, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700
"}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0 } }, { kDeviceNameDefault , Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
} }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/padtranspose/padtranspose_6464.hpp000066400000000000000000000533301463263031500254140ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Padtranspose6464' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry PadtransposeComplexDouble = { "Padtranspose", Precision::kComplexDouble, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 
2.40GHz "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 0, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { Name{"GeForce GTX 1080 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 0, 16, 
2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast
// [file: CLBlast-1.6.3/src/database/kernels/transpose/transpose.cpp]
// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Transpose' kernels. // // ================================================================================================= #include "database/kernels/transpose/transpose.hpp" #include "database/kernels/transpose/transpose_16.hpp" #include "database/kernels/transpose/transpose_32.hpp" #include "database/kernels/transpose/transpose_3232.hpp" #include "database/kernels/transpose/transpose_64.hpp" #include "database/kernels/transpose/transpose_6464.hpp"
// [file: CLBlast-1.6.3/src/database/kernels/transpose/transpose.hpp]
// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Transpose' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry TransposeHalf; extern const DatabaseEntry TransposeSingle; extern const DatabaseEntry TransposeComplexSingle; extern const DatabaseEntry TransposeDouble; extern const DatabaseEntry TransposeComplexDouble; } // namespace database } // namespace clblast
// [file: CLBlast-1.6.3/src/database/kernels/transpose/transpose_16.hpp]
// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script.
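// -------------------------------------------------------------------------------------------------
// A minimal illustrative sketch (not CLBlast's actual database code) of how one entry above can be
// consumed: the parameter names declared for a DatabaseEntry line up positionally with the leading
// values of a Params row, and together they form the OpenCL build options for the tuned kernel.
// kParamNames, kRow and AsDefines() below are hypothetical stand-ins introduced for this example.
#include <array>
#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>

namespace {
// One parameter-name list and one Params row, mirroring the TransposeSingle table further below;
// the unused trailing slots of a Params row simply stay zero.
const std::array<std::string, 4> kParamNames = {"TRA_DIM", "TRA_PAD", "TRA_SHUFFLE", "TRA_WPT"};
const std::array<std::size_t, 16> kRow = {16, 0, 1, 4};

// Renders the named parameters as OpenCL compile defines,
// e.g. "-DTRA_DIM=16 -DTRA_PAD=0 -DTRA_SHUFFLE=1 -DTRA_WPT=4".
std::string AsDefines() {
  std::ostringstream options;
  for (std::size_t i = 0; i < kParamNames.size(); ++i) {
    if (i != 0) { options << " "; }
    options << "-D" << kParamNames[i] << "=" << kRow[i];
  }
  return options.str();
}
}  // namespace

int main() {
  std::cout << AsDefines() << std::endl;  // prints the build options for this sketch's row
  return 0;
}
// -------------------------------------------------------------------------------------------------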
// // This file populates the database with best-found tuning parameters for the 'Transpose16' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TransposeHalf = { "Transpose", Precision::kHalf, {"TRA_DIM", "TRA_PAD", "TRA_SHUFFLE", "TRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 8, 1, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 1, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 4, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 4, 1, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T628 "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 1, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 8, 1, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 16, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/transpose/transpose_32.hpp000066400000000000000000000641371463263031500240720ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. 
// // This file populates the database with best-found tuning parameters for the 'Transpose32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TransposeSingle = { "Transpose", Precision::kSingle, {"TRA_DIM", "TRA_PAD", "TRA_SHUFFLE", "TRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD 
Radeon RX 6900 XT "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 1, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 1, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 8, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 8, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 16, 1, 0, 2, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 32, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GT 650M "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 8, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 4, 0, 1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 4, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce 
GTX 1650 "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 0, 
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/transpose/transpose_3232.hpp000066400000000000000000000626571463263031500242440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Transpose3232' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TransposeComplexSingle = { "Transpose", Precision::kComplexSingle, {"TRA_DIM", "TRA_PAD", "TRA_SHUFFLE", "TRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 64, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 32, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 4, 0, 1, 2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, 
Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/transpose/transpose_64.hpp000066400000000000000000000532721463263031500240750ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Transpose64' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TransposeDouble = { "Transpose", Precision::kDouble, {"TRA_DIM", "TRA_PAD", "TRA_SHUFFLE", "TRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , 
Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 4, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 
0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 32, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 
"}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/transpose/transpose_6464.hpp000066400000000000000000000523121463263031500242410ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Transpose6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TransposeComplexDouble = { "Transpose", Precision::kComplexDouble, {"TRA_DIM", "TRA_PAD", "TRA_SHUFFLE", "TRA_WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 8, 1, 1, 
2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 16, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { 
"default", { { Name{"Mali-T760 "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 32, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
} }, { Name{"GeForce GTX 750 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 32, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 4, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 16, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/trsv_routine/000077500000000000000000000000001463263031500215515ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/trsv_routine/trsv_routine.cpp000066400000000000000000000015231463263031500250210ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Trsv_Routine' kernels. 
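//
// The includes below pull in one DatabaseEntry per precision. As a minimal
// sketch of how these entries could be collected into a single lookup list --
// the 'kTrsvRoutineEntries' name is hypothetical and not part of the
// generated sources:
//
//   #include <vector>
//   using clblast::database::DatabaseEntry;
//   const std::vector<const DatabaseEntry*> kTrsvRoutineEntries = {
//       &clblast::database::TrsvRoutineHalf,           // trsv_routine_16.hpp
//       &clblast::database::TrsvRoutineSingle,         // trsv_routine_32.hpp
//       &clblast::database::TrsvRoutineComplexSingle,  // trsv_routine_3232.hpp
//       &clblast::database::TrsvRoutineDouble,         // trsv_routine_64.hpp
//       &clblast::database::TrsvRoutineComplexDouble,  // trsv_routine_6464.hpp
//   };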
// // ================================================================================================= #include "database/kernels/trsv_routine/trsv_routine.hpp" #include "database/kernels/trsv_routine/trsv_routine_16.hpp" #include "database/kernels/trsv_routine/trsv_routine_32.hpp" #include "database/kernels/trsv_routine/trsv_routine_3232.hpp" #include "database/kernels/trsv_routine/trsv_routine_64.hpp" #include "database/kernels/trsv_routine/trsv_routine_6464.hpp" CLBlast-1.6.3/src/database/kernels/trsv_routine/trsv_routine.hpp000066400000000000000000000015341463263031500250300ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Trsv_Routine' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry TrsvRoutineHalf; extern const DatabaseEntry TrsvRoutineSingle; extern const DatabaseEntry TrsvRoutineComplexSingle; extern const DatabaseEntry TrsvRoutineDouble; extern const DatabaseEntry TrsvRoutineComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/trsv_routine/trsv_routine_16.hpp000066400000000000000000000016251463263031500253370ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Trsv_Routine16' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TrsvRoutineHalf = { "TrsvRoutine", Precision::kHalf, {"TRSV_BLOCK_SIZE"}, { { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/trsv_routine/trsv_routine_32.hpp000066400000000000000000000452001463263031500253320ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Trsv_Routine32' kernels. 
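//
// Each Params initializer holds 16 slots, but only the leading values that
// correspond to the kernel's parameter names are meaningful -- here a single
// name, TRSV_BLOCK_SIZE -- and the trailing zeros are padding. A minimal,
// self-contained sketch of that name-to-value mapping ('DecodeParams' is an
// illustrative helper, not CLBlast API):
//
//   #include <array>
//   #include <cstddef>
//   #include <map>
//   #include <string>
//   #include <vector>
//
//   std::map<std::string, std::size_t> DecodeParams(
//       const std::vector<std::string>& names,
//       const std::array<std::size_t, 16>& values) {
//     std::map<std::string, std::size_t> result;
//     for (std::size_t i = 0; i < names.size(); ++i) { result[names[i]] = values[i]; }
//     return result;
//   }
//
// For example, DecodeParams({"TRSV_BLOCK_SIZE"}, {32, 0, /*...*/}) yields
// {{"TRSV_BLOCK_SIZE", 32}}.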
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TrsvRoutineSingle = { "TrsvRoutine", Precision::kSingle, {"TRSV_BLOCK_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { 
Name{"GeForce 920A "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, 
Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/trsv_routine/trsv_routine_3232.hpp000066400000000000000000000430311463263031500254770ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Trsv_Routine3232' kernels. 
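//
// Within each DatabaseEntry, entries are grouped by device type and vendor,
// then by architecture string (e.g. "SM7.5", "gfx1030", or "default"), then by
// exact device name, with kDeviceNameDefault as the per-architecture fallback
// and a final kDeviceTypeAll/"default" catch-all. A hedged sketch of that
// fallback order over a simplified two-level table ('Table' and 'Lookup' are
// illustrative, not CLBlast API):
//
//   #include <cstddef>
//   #include <map>
//   #include <string>
//
//   using Table = std::map<std::string,                 // architecture
//                          std::map<std::string,        // device name
//                                   std::size_t>>;      // TRSV_BLOCK_SIZE
//
//   std::size_t Lookup(const Table& table, const std::string& arch,
//                      const std::string& device) {
//     auto section = table.find(arch);
//     if (section == table.end()) { section = table.find("default"); }
//     auto entry = section->second.find(device);
//     if (entry == section->second.end()) {
//       entry = section->second.find("default");  // plays the role of kDeviceNameDefault
//     }
//     return entry->second;
//   }
//
// The sketch assumes the "default" keys exist, as they do in every section of
// the generated tables.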
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TrsvRoutineComplexSingle = { "TrsvRoutine", Precision::kComplexSingle, {"TRSV_BLOCK_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/trsv_routine/trsv_routine_64.hpp000066400000000000000000000352331463263031500253440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Trsv_Routine64' kernels. 
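//
// The numeric suffixes in these file names encode the precision of the entry
// they define: _16 is Precision::kHalf, _32 is Precision::kSingle, _3232 is
// Precision::kComplexSingle (two 32-bit components), _64 is Precision::kDouble,
// and _6464 is Precision::kComplexDouble (two 64-bit components).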
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TrsvRoutineDouble = { "TrsvRoutine", Precision::kDouble, {"TRSV_BLOCK_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/trsv_routine/trsv_routine_6464.hpp000066400000000000000000000352531463263031500255200ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. 
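//
// Note that device names inside Name{...} are stored right-padded with spaces
// to a fixed width (50 characters in these tables), so a runtime device string
// presumably needs the same padding, or a trimmed comparison, before matching.
// A minimal sketch of such a normalization ('PadName' is an illustrative
// helper, not CLBlast API):
//
//   #include <cstddef>
//   #include <string>
//
//   std::string PadName(std::string name, std::size_t width = 50) {
//     name.resize(width, ' ');  // pads with spaces, or truncates, to 'width'
//     return name;
//   }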
// // This file populates the database with best-found tuning parameters for the 'Trsv_Routine6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry TrsvRoutineComplexDouble = { "TrsvRoutine", Precision::kComplexDouble, {"TRSV_BLOCK_SIZE"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 24, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xaxpy/000077500000000000000000000000001463263031500201575ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/xaxpy/xaxpy.cpp000066400000000000000000000013701463263031500220350ustar00rootroot00000000000000 // 
================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels. // // ================================================================================================= #include "database/kernels/xaxpy/xaxpy.hpp" #include "database/kernels/xaxpy/xaxpy_16.hpp" #include "database/kernels/xaxpy/xaxpy_32.hpp" #include "database/kernels/xaxpy/xaxpy_3232.hpp" #include "database/kernels/xaxpy/xaxpy_64.hpp" #include "database/kernels/xaxpy/xaxpy_6464.hpp" CLBlast-1.6.3/src/database/kernels/xaxpy/xaxpy.hpp000066400000000000000000000014671463263031500220510ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XaxpyHalf; extern const DatabaseEntry XaxpySingle; extern const DatabaseEntry XaxpyComplexSingle; extern const DatabaseEntry XaxpyDouble; extern const DatabaseEntry XaxpyComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xaxpy/xaxpy_16.hpp000066400000000000000000000244011463263031500223500ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xaxpy16' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XaxpyHalf = { "Xaxpy", Precision::kHalf, {"VW", "WGS", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 2, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 2, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 4, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 4, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T628 "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 4, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 2, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast 
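
Each of the generated tables above follows the same shape: a DatabaseEntry names the kernel and precision, lists the tuned parameter names (here "VW", "WGS", "WPT"), and stores one fixed-width Params array of 16 values per device, with unused trailing slots zeroed. Devices are grouped by vendor and architecture, and every group carries a kDeviceNameDefault fallback entry. The snippet below is a minimal, self-contained sketch of how such a table can be consumed: exact device name first, then the group-wide default. The DeviceEntry struct, Lookup function, kDefault constant, and the "Some Untuned GPU" name are invented for illustration and are not CLBlast's actual Database API; the parameter values are copied from the SM7.5 group of the single-precision table above.

#include <array>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

using Params = std::array<std::size_t, 16>;  // fixed width; unused trailing slots stay 0

struct DeviceEntry {
  std::string name;
  Params params;
};

const std::string kDefault = "default";  // stands in for kDeviceNameDefault above

// Returns the tuned parameters for 'device', falling back to the
// group-wide default entry when the device was never tuned.
Params Lookup(const std::vector<DeviceEntry>& entries, const std::string& device) {
  for (const auto& entry : entries) {
    if (entry.name == device) { return entry.params; }
  }
  for (const auto& entry : entries) {
    if (entry.name == kDefault) { return entry.params; }
  }
  return Params{};  // no match at all: all-zero parameters
}

int main() {
  // Values mirror the SM7.5 group of the single-precision Xaxpy table.
  const std::vector<DeviceEntry> sm75 = {
      {"Tesla T4", Params{2, 64, 1}},  // VW=2, WGS=64, WPT=1
      {kDefault, Params{4, 64, 4}},    // architecture-wide fallback
  };
  const auto params = Lookup(sm75, "Some Untuned GPU");  // hypothetical device
  std::cout << "VW=" << params[0] << " WGS=" << params[1]
            << " WPT=" << params[2] << "\n";  // prints the 4/64/4 fallback
}
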
CLBlast-1.6.3/src/database/kernels/xaxpy/xaxpy_32.hpp000066400000000000000000000653561463263031500223640ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xaxpy32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XaxpySingle = { "Xaxpy", Precision::kSingle, {"VW", "WGS", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 2, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { 
"gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 4, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 2, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 8, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // 
Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 8, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 2, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 4, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 2, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 1, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 2048, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 2, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GT 650M "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 4, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { Name{"NVIDIA GeForce MX150 "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 2, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 4, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 4, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 4, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 512, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 512, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xaxpy/xaxpy_3232.hpp000066400000000000000000000633551463263031500225260ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xaxpy3232' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XaxpyComplexSingle = { "Xaxpy", Precision::kComplexSingle, {"VW", "WGS", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 2, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 8, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 1, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 2, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 8, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) 
Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 128, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 
1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 1024, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 512, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 8, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 1024, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 2, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , 
Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xaxpy/xaxpy_64.hpp000066400000000000000000000535311463263031500223610ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xaxpy64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XaxpyDouble = { "Xaxpy", Precision::kDouble, {"VW", "WGS", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 2, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 2, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 2, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) 
i5-4570 CPU @ 3.20GHz "}, Params{ 1, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 8, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 8, 2048, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 2, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 8, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 4, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 
128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 1024, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 512, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 4, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 8, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 2, 512, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 2, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xaxpy/xaxpy_6464.hpp000066400000000000000000000535711463263031500225370ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xaxpy6464' kernels. 
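//
// Illustrative sketch (not part of the generated table): each DatabaseEntry
// pairs a short list of parameter names -- {"VW", "WGS", "WPT"} for the Xaxpy
// kernels -- with fixed-width Params arrays of sixteen values, so the unused
// trailing slots are simply zero. A hypothetical helper that zips the two back
// into a name-to-value map could look as follows; ParamValues and ZipParams
// are stand-ins for this sketch, not CLBlast's internal types.

#include <array>
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

using ParamValues = std::array<std::size_t, 16>;  // sixteen values per entry

std::unordered_map<std::string, std::size_t> ZipParams(
    const std::vector<std::string> &names, const ParamValues &values) {
  std::unordered_map<std::string, std::size_t> result;
  for (std::size_t i = 0; i < names.size(); ++i) {  // only the named slots
    result[names[i]] = values[i];
  }
  return result;  // e.g. Params{ 1, 128, 1, 0, ... } -> VW=1, WGS=128, WPT=1
}
//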
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XaxpyComplexDouble = { "Xaxpy", Precision::kComplexDouble, {"VW", "WGS", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 8, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 8, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 2, 2048, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { Name{"GeForce GTX 1070 Ti "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 1024, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 512, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 4, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xconvgemm/000077500000000000000000000000001463263031500210115ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/xconvgemm/xconvgemm.cpp000066400000000000000000000014541463263031500235240ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xconvgemm' kernels. // // ================================================================================================= #include "database/kernels/xconvgemm/xconvgemm.hpp" #include "database/kernels/xconvgemm/xconvgemm_16.hpp" #include "database/kernels/xconvgemm/xconvgemm_32.hpp" #include "database/kernels/xconvgemm/xconvgemm_3232.hpp" #include "database/kernels/xconvgemm/xconvgemm_64.hpp" #include "database/kernels/xconvgemm/xconvgemm_6464.hpp" CLBlast-1.6.3/src/database/kernels/xconvgemm/xconvgemm.hpp000066400000000000000000000015171463263031500235310ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xconvgemm' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XconvgemmHalf; extern const DatabaseEntry XconvgemmSingle; extern const DatabaseEntry XconvgemmComplexSingle; extern const DatabaseEntry XconvgemmDouble; extern const DatabaseEntry XconvgemmComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xconvgemm/xconvgemm_16.hpp000066400000000000000000000107061463263031500240370ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xconvgemm16' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XconvgemmHalf = { "Xconvgemm", Precision::kHalf, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "default", { { kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 8, 16, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 16, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 8, 16, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 8, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 8, 16, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 8, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 1, 8, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 8, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 1, 8, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 8, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 8, 8, 8, 16, 0, 0, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 8, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 8, 8, 8, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xconvgemm/xconvgemm_32.hpp000066400000000000000000000233611463263031500240360ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. 
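//
// The vendor tables in these files share one lookup shape: within a vendor,
// architectures (e.g. "Ellesmere", "gfx1030") map device names to Params, each
// architecture carries a kDeviceNameDefault fallback, a vendor-wide "default"
// architecture follows, and a kDeviceTypeAll/"default" entry closes every
// DatabaseEntry. A simplified resolver over that shape -- a sketch assuming
// kDeviceNameDefault compares equal to "default", with illustrative type names
// rather than CLBlast's internals:

#include <string>
#include <utility>
#include <vector>

template <typename ParamsT>
struct ArchTable {
  std::string arch;                                      // e.g. "gfx1030"
  std::vector<std::pair<std::string, ParamsT>> devices;  // device name -> params
};

// Exact device match first, then the architecture's default entry; a caller
// that still gets nullptr would fall through to the catch-all entry.
template <typename ParamsT>
const ParamsT *Resolve(const std::vector<ArchTable<ParamsT>> &tables,
                       const std::string &arch, const std::string &device) {
  for (const auto &table : tables) {
    if (table.arch != arch && table.arch != "default") { continue; }
    const ParamsT *fallback = nullptr;
    for (const auto &entry : table.devices) {
      if (entry.first == device) { return &entry.second; }
      if (entry.first == "default") { fallback = &entry.second; }
    }
    if (fallback != nullptr) { return fallback; }
  }
  return nullptr;
}
//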
// // This file populates the database with best-found tuning parameters for the 'Xconvgemm32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XconvgemmSingle = { "Xconvgemm", Precision::kSingle, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 1, 16, 8, 16, 32, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 8, 16, 32, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 8, 16, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 8, 8, 8, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 8, 8, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 16, 8, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 8, 16, 32, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 32, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 8, 8, 16, 8, 0, 0, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 16, 8, 0, 0, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 8, 8, 8, 8, 0, 0, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 8, 8, 0, 0, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 8, 8, 8, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 8, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 8, 8, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 8, 8, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 1, 8, 8, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 16, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 8, 8, 16, 8, 0, 0, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 16, 8, 0, 0, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 1, 8, 8, 8, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 8, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 1, 16, 16, 16, 16, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 16, 16, 16, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs 
kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 16, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Gen9 HD Graphics NEO "}, Params{ 1, 16, 32, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 16, 8, 8, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 1, 16, 8, 16, 8, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 8, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM3.5", { { Name{"GeForce 920A "}, Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 1, 16, 16, 32, 16, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 16, 32, 16, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"Tesla P4 "}, Params{ 1, 16, 16, 32, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 16, 32, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 8, 32, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1, 8, 32, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 32, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xconvgemm/xconvgemm_3232.hpp000066400000000000000000000017471463263031500242070ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. 
// // This file populates the database with best-found tuning parameters for the 'Xconvgemm3232' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XconvgemmComplexSingle = { "Xconvgemm", Precision::kComplexSingle, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xconvgemm/xconvgemm_64.hpp000066400000000000000000000153341463263031500240440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xconvgemm64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XconvgemmDouble = { "Xconvgemm", Precision::kDouble, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 8, 32, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 16, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 8, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 16, 8, 8, 16, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 8, 8, 16, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 8, 8, 16, 16, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 8, 16, 16, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 8, 16, 8, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 8, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 8, 16, 8, 16, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 8, 16, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 16, 32, 8, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 32, 8, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 16, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Gen9 HD Graphics NEO "}, Params{ 1, 8, 16, 16, 8, 0, 0, 1, 2, 
32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 8, 8, 32, 8, 0, 0, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 1, 16, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"Tesla P4 "}, Params{ 1, 16, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 16, 8, 16, 8, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 8, 16, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 8, 16, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1, 8, 16, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 16, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 32, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 16, 16, 32, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xconvgemm/xconvgemm_6464.hpp000066400000000000000000000017471463263031500242210ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xconvgemm6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XconvgemmComplexDouble = { "Xconvgemm", Precision::kComplexDouble, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 8, 16, 8, 8, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xdot/000077500000000000000000000000001463263031500177645ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/xdot/xdot.cpp000066400000000000000000000013531463263031500214500ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xdot' kernels. // // ================================================================================================= #include "database/kernels/xdot/xdot.hpp" #include "database/kernels/xdot/xdot_16.hpp" #include "database/kernels/xdot/xdot_32.hpp" #include "database/kernels/xdot/xdot_3232.hpp" #include "database/kernels/xdot/xdot_64.hpp" #include "database/kernels/xdot/xdot_6464.hpp" CLBlast-1.6.3/src/database/kernels/xdot/xdot.hpp000066400000000000000000000014611463263031500214550ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xdot' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XdotHalf; extern const DatabaseEntry XdotSingle; extern const DatabaseEntry XdotComplexSingle; extern const DatabaseEntry XdotDouble; extern const DatabaseEntry XdotComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xdot/xdot_16.hpp000066400000000000000000000245261463263031500217720ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xdot16' kernels. 
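//
// The two parameters tuned for Xdot, WGS1 and WGS2, match a two-stage
// reduction: stage one emits one partial sum per work-group of size WGS1,
// stage two reduces the partials with a single work-group of size WGS2. The
// CPU analogue below illustrates that scheme only -- the real work happens in
// OpenCL kernels, and TwoStageDot is a made-up name for this sketch.

#include <cstddef>
#include <vector>

double TwoStageDot(const std::vector<double> &x, const std::vector<double> &y,
                   std::size_t wgs1) {
  // Stage 1: one partial sum per chunk of wgs1 elements (per "work-group").
  std::vector<double> partials;
  for (std::size_t start = 0; start < x.size(); start += wgs1) {
    double acc = 0.0;
    for (std::size_t i = start; i < start + wgs1 && i < x.size(); ++i) {
      acc += x[i] * y[i];
    }
    partials.push_back(acc);
  }
  // Stage 2: reduce the partials (on the device, one group of size WGS2).
  double result = 0.0;
  for (const double partial : partials) { result += partial; }
  return result;
}
//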
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XdotHalf = { "Xdot", Precision::kHalf, {"WGS1", "WGS2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 
256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 512, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T628 "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast 
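// Taken together, the xdot_16/_32/_3232/_64/_6464 headers cover the five
// precisions (half, single, complex-single, double, complex-double), all of
// which xdot.cpp includes and xdot.hpp declares as extern DatabaseEntry
// objects. A hypothetical selector over those objects -- a sketch, not the
// library's actual dispatch -- assuming only the Precision enumerators that
// appear in these files:

namespace clblast {
namespace database {

inline const DatabaseEntry &SelectXdotEntry(const Precision precision) {
  switch (precision) {
    case Precision::kHalf:          return XdotHalf;
    case Precision::kSingle:        return XdotSingle;
    case Precision::kComplexSingle: return XdotComplexSingle;
    case Precision::kComplexDouble: return XdotComplexDouble;
    case Precision::kDouble:
    default:                        return XdotDouble;
  }
}

}  // namespace database
}  // namespace clblast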
CLBlast-1.6.3/src/database/kernels/xdot/xdot_32.hpp000066400000000000000000000636741463263031500217770ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xdot32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XdotSingle = { "Xdot", Precision::kSingle, {"WGS1", "WGS2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 XT "}, Params{ 256, 32, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series 
BXE-4-32 "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1024, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1024, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 512, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, 
} }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GT 650M "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 512, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 128, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 128, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1024, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 256, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 32, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1024, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 512, 256, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xdot/xdot_3232.hpp000066400000000000000000000620501463263031500221270ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xdot3232' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XdotComplexSingle = { "Xdot", Precision::kComplexSingle, {"WGS1", "WGS2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", 
{ { Name{"AMD Radeon R9 380 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, 
{ kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 32, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1024, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1024, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 512, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 
IvyBridge M GT2 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1024, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 1024, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 256, 
32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1024, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 128, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 256, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 512, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1024, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xdot/xdot_64.hpp000066400000000000000000000517711463263031500217770ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xdot64' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XdotDouble = { "Xdot", Precision::kDouble, {"WGS1", "WGS2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 32, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1024, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1024, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1024, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 256, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 128, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 256, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 64, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"NVIDIA GeForce RTX 4080 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xdot/xdot_6464.hpp000066400000000000000000000517621463263031500221510ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xdot6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XdotComplexDouble = { "Xdot", Precision::kComplexDouble, {"WGS1", "WGS2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 
128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, 
Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 512, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 256, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 64, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 128, 32, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 128, 
32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1024, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 512, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm/000077500000000000000000000000001463263031500201235ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/xgemm/xgemm.cpp000066400000000000000000000013701463263031500217450ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm' kernels. // // ================================================================================================= #include "database/kernels/xgemm/xgemm.hpp" #include "database/kernels/xgemm/xgemm_16.hpp" #include "database/kernels/xgemm/xgemm_32.hpp" #include "database/kernels/xgemm/xgemm_3232.hpp" #include "database/kernels/xgemm/xgemm_64.hpp" #include "database/kernels/xgemm/xgemm_6464.hpp" CLBlast-1.6.3/src/database/kernels/xgemm/xgemm.hpp000066400000000000000000000014671463263031500217610ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm' kernels. 
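//
// Added sketch (an assumption for illustration, not emitted by database.py): the extern
// DatabaseEntry objects declared below cover the five precisions of the 'Xgemm' kernel
// family. A hypothetical consumer could gather them for a precision-based lookup like so:
//
//   const std::vector<clblast::database::DatabaseEntry> xgemm_database = {
//       clblast::database::XgemmHalf,           // Precision::kHalf          (xgemm_16.hpp)
//       clblast::database::XgemmSingle,         // Precision::kSingle        (xgemm_32.hpp)
//       clblast::database::XgemmComplexSingle,  // Precision::kComplexSingle (xgemm_3232.hpp)
//       clblast::database::XgemmDouble,         // Precision::kDouble        (xgemm_64.hpp)
//       clblast::database::XgemmComplexDouble,  // Precision::kComplexDouble (xgemm_6464.hpp)
//   };
//
// Within each entry, per-device Params rows fall back to the kDeviceNameDefault row and then
// to the "default" architecture, vendor, and kDeviceTypeAll sections when no exact device
// name matches (fallback order inferred from the table layout, not verified here).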
// // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XgemmHalf; extern const DatabaseEntry XgemmSingle; extern const DatabaseEntry XgemmComplexSingle; extern const DatabaseEntry XgemmDouble; extern const DatabaseEntry XgemmComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm/xgemm_16.hpp000066400000000000000000000233501463263031500222620ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm16' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmHalf = { "Xgemm", Precision::kHalf, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 1, 16, 2, 16, 16, 128, 16, 16, 128, 1, 1, 1, 1, 8, 1 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 1, 16, 2, 16, 16, 128, 16, 16, 128, 1, 1, 1, 1, 8, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 1 } }, } }, { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 0, 1, 16, 2, 16, 8, 128, 16, 32, 128, 1, 1, 1, 0, 8, 4 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 0, 1, 32, 2, 8, 16, 128, 16, 16, 128, 1, 1, 1, 0, 8, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 2, 4 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 0, 1, 32, 2, 8, 16, 128, 16, 16, 128, 1, 1, 1, 0, 8, 8 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 0, 1, 32, 2, 8, 16, 128, 16, 16, 128, 1, 1, 1, 0, 8, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 16, 128, 16, 16, 128, 1, 1, 1, 0, 8, 8 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 0, 1, 32, 2, 16, 16, 128, 32, 8, 128, 1, 1, 0, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 128, 32, 8, 128, 1, 1, 0, 0, 2, 4 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 0, 1, 16, 2, 8, 8, 64, 8, 8, 128, 1, 1, 1, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 64, 8, 8, 128, 1, 1, 1, 0, 2, 1 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT 
"}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 1, 32, 2, 16, 8, 64, 16, 8, 128, 1, 1, 0, 1, 4, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 8, 64, 16, 8, 128, 1, 1, 0, 1, 4, 8 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 1, 32, 2, 8, 8, 128, 8, 16, 128, 1, 1, 1, 0, 8, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 128, 8, 16, 128, 1, 1, 1, 0, 8, 1 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 0, 1, 32, 2, 8, 8, 128, 16, 32, 128, 1, 1, 1, 1, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 1, 1, 0, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 1, 1, 0, 0, 2, 4 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 16, 2, 8, 8, 64, 8, 16, 128, 1, 1, 0, 1, 8, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 64, 8, 16, 128, 1, 1, 0, 1, 8, 2 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 32, 0, 1, 0, 1, 8, 4 } }, { Name{"Mali-T760 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 1, 2 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 0, 1, 16, 2, 8, 8, 32, 16, 16, 128, 0, 1, 1, 0, 4, 8 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 0, 1, 16, 2, 16, 16, 64, 16, 8, 128, 1, 1, 0, 1, 1, 8 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 0, 1, 16, 2, 16, 16, 64, 16, 8, 128, 1, 1, 0, 1, 1, 8 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 0, 1, 32, 2, 8, 16, 64, 8, 8, 128, 1, 1, 0, 1, 1, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 2, 2 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 32, 32, 
128, 8, 8, 128, 0, 0, 0, 1, 2, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 128, 8, 8, 128, 0, 0, 0, 1, 2, 8 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 16, 2, 16, 16, 64, 8, 8, 128, 1, 0, 1, 1, 2, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 16, 64, 8, 8, 128, 1, 0, 1, 1, 2, 8 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 0, 0, 0, 0, 4, 4 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm/xgemm_32.hpp000066400000000000000000000670371463263031500222720ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmSingle = { "Xgemm", Precision::kSingle, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 64, 1, 1, 0, 0, 1, 2 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 1, 16, 2, 32, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 4 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 1, 32, 2, 16, 16, 64, 32, 16, 128, 1, 1, 0, 1, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 0, 1, 32, 2, 8, 16, 128, 16, 16, 128, 1, 1, 1, 0, 8, 8 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 16, 128, 0, 0, 0, 0, 2, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 2, 4 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"AMD Radeon R9 290X "}, Params{ 0, 1, 16, 2, 16, 32, 128, 32, 8, 64, 1, 1, 1, 1, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 16, 64, 16, 8, 64, 1, 1, 0, 0, 4, 2 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 0, 1, 16, 2, 32, 16, 64, 32, 16, 128, 1, 1, 1, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 32, 16, 64, 32, 16, 128, 1, 1, 1, 0, 2, 4 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 0, 1, 16, 2, 16, 8, 32, 16, 16, 128, 0, 0, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 8, 32, 16, 16, 128, 0, 0, 1, 0, 1, 1 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 0, 1, 32, 2, 16, 32, 128, 16, 8, 64, 0, 0, 0, 0, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 32, 128, 16, 8, 64, 0, 0, 0, 0, 4, 1 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 0, 1, 16, 2, 16, 32, 64, 16, 8, 128, 1, 1, 0, 0, 2, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 32, 64, 16, 8, 128, 1, 1, 0, 0, 2, 8 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 
2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 0, 1, 32, 2, 16, 16, 128, 16, 8, 64, 1, 1, 1, 0, 4, 1 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 0, 1, 16, 2, 16, 8, 64, 8, 8, 64, 1, 1, 0, 1, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 0, 0, 0, 0, 4, 4 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 0, 1, 16, 2, 8, 8, 128, 32, 8, 64, 1, 0, 1, 1, 4, 1 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 1, 32, 2, 16, 16, 128, 16, 8, 64, 1, 1, 1, 0, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 0, 1, 32, 2, 8, 8, 128, 32, 16, 128, 1, 1, 1, 1, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 128, 32, 16, 128, 1, 1, 1, 1, 4, 4 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 0, 1, 32, 2, 8, 8, 128, 32, 16, 128, 1, 1, 1, 1, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 128, 32, 16, 128, 1, 1, 1, 1, 4, 4 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 0, 1, 32, 2, 16, 16, 128, 16, 8, 64, 1, 1, 1, 0, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 128, 16, 8, 64, 1, 1, 1, 0, 4, 1 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 0, 1, 16, 2, 8, 16, 32, 32, 8, 128, 1, 1, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 16, 32, 32, 8, 128, 1, 1, 0, 0, 2, 1 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 0, 1, 1, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 0, 1, 1, 0, 4, 4 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 0, 1, 32, 2, 16, 16, 64, 32, 8, 128, 0, 1, 1, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 32, 8, 128, 0, 1, 1, 0, 1, 2 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 32, 8, 128, 0, 1, 1, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 32, 8, 128, 0, 1, 1, 0, 1, 2 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 1, 32, 2, 8, 8, 128, 8, 16, 128, 1, 1, 1, 0, 1, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 128, 8, 16, 128, 1, 1, 1, 0, 1, 8 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 16, 2, 32, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 4 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 0, 1, 32, 2, 8, 8, 128, 16, 32, 128, 1, 1, 1, 1, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , 
Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 16, 2, 32, 16, 128, 8, 8, 128, 1, 1, 0, 1, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 32, 16, 128, 8, 8, 128, 1, 1, 0, 1, 2, 2 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"Mali-T760 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 1, 1, 0, 0, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 1, 1, 0, 0, 4, 2 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 1, 8, 1, 1, 32, 32, 64, 8, 8, 128, 0, 0, 0, 0, 2, 4 } }, { Name{"Apple M2 Max "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 2 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 1 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 1, 16, 2, 8, 8, 128, 16, 8, 128, 0, 1, 1, 1, 1, 8 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1, 4, 1, 1, 32, 32, 128, 4, 4, 128, 0, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 1, 32, 2, 32, 16, 64, 32, 8, 64, 0, 1, 1, 0, 1, 1 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1, 16, 1, 1, 2, 2, 64, 2, 2, 128, 0, 0, 0, 0, 8, 2 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 0, 1, 32, 8, 32, 32, 64, 32, 16, 64, 1, 1, 1, 0, 2, 2 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 1, 32, 2, 16, 8, 128, 16, 8, 64, 0, 0, 1, 0, 1, 2 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 1, 32, 2, 32, 8, 128, 8, 8, 128, 1, 1, 1, 1, 2, 8 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 1, 16, 2, 8, 8, 128, 8, 8, 128, 1, 1, 1, 0, 1, 8 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 0, 1, 32, 8, 16, 16, 64, 32, 32, 64, 0, 1, 1, 0, 1, 2 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 0, 1, 32, 2, 16, 32, 32, 8, 8, 64, 0, 1, 0, 0, 1, 8 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 2, 1, 1, 16, 16, 64, 4, 4, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 0, 1, 32, 2, 16, 8, 32, 8, 32, 128, 1, 1, 1, 1, 1, 4 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 1, 1, 1, 16, 16, 16, 8, 8, 64, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 16, 1, 1, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 2, 8 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 16, 1, 1, 16, 16, 64, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 0, 1, 32, 2, 8, 8, 128, 32, 16, 64, 0, 0, 1, 0, 4, 2 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 1, 32, 8, 8, 8, 64, 32, 16, 64, 1, 1, 1, 1, 4, 2 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 1, 4, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 
} }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 32, 2, 32, 8, 64, 16, 16, 128, 0, 0, 0, 1, 1, 2 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 16, 128, 1, 1, 1, 1, 2, 4 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 2, 1, 1, 4, 4, 32, 8, 8, 64, 0, 0, 0, 0, 2, 2 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 0, 1, 16, 2, 8, 8, 64, 16, 8, 64, 1, 1, 0, 1, 1, 2 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 0, 1, 16, 2, 16, 16, 64, 16, 8, 128, 1, 1, 0, 1, 1, 8 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 4, 1, 1, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 0, 1, 32, 2, 8, 16, 64, 8, 8, 128, 1, 1, 0, 1, 1, 8 } }, { Name{"Iris "}, Params{ 0, 1, 16, 8, 16, 8, 128, 32, 16, 64, 1, 1, 1, 1, 4, 1 } }, { Name{"Iris Pro "}, Params{ 0, 1, 16, 2, 16, 8, 64, 32, 32, 128, 1, 1, 1, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 0, 1, 16, 2, 8, 16, 64, 8, 8, 64, 0, 0, 1, 0, 1, 4 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 0, 1, 32, 2, 32, 32, 32, 32, 8, 128, 0, 0, 1, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 16, 32, 8, 8, 64, 0, 0, 1, 0, 1, 4 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 0, 1, 16, 2, 16, 8, 64, 32, 16, 64, 1, 1, 1, 1, 2, 2 } }, { Name{"GeForce GTX 580 "}, Params{ 0, 1, 16, 2, 32, 8, 128, 16, 32, 64, 1, 1, 1, 0, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 32, 32, 64, 0, 0, 0, 0, 1, 2 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 16, 64, 1, 1, 1, 1, 2, 4 } }, { Name{"GeForce GT 650M "}, Params{ 0, 1, 32, 2, 8, 8, 32, 32, 32, 64, 1, 1, 0, 0, 4, 2 } }, { Name{"GeForce GTX 670 "}, Params{ 0, 1, 16, 2, 8, 8, 64, 16, 16, 64, 1, 1, 1, 0, 2, 4 } }, { Name{"GeForce GTX 680 "}, Params{ 0, 1, 32, 8, 8, 16, 64, 32, 16, 128, 1, 1, 0, 0, 4, 2 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 32, 32, 64, 1, 1, 0, 0, 4, 2 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 0, 1, 16, 2, 16, 8, 128, 16, 32, 128, 1, 1, 1, 0, 4, 1 } }, { Name{"GeForce GTX TITAN "}, Params{ 0, 1, 16, 8, 32, 16, 64, 8, 8, 64, 1, 1, 1, 0, 2, 2 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 0, 1, 16, 2, 16, 8, 64, 16, 16, 64, 1, 1, 1, 0, 4, 1 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 8, 64, 0, 1, 1, 0, 4, 2 } }, { Name{"Tesla K20m "}, Params{ 0, 1, 16, 2, 32, 16, 64, 16, 8, 64, 1, 1, 1, 0, 2, 4 } }, { Name{"Tesla K40m "}, Params{ 0, 1, 16, 8, 16, 8, 64, 16, 16, 128, 1, 1, 1, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 0, 1, 16, 2, 32, 8, 128, 8, 16, 128, 1, 1, 1, 0, 4, 1 } }, { Name{"GeForce GTX 750 "}, Params{ 0, 1, 16, 2, 16, 16, 64, 32, 8, 128, 1, 1, 1, 1, 1, 2 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 0, 1, 16, 2, 16, 16, 128, 32, 8, 64, 1, 1, 0, 1, 8, 2 } }, { Name{"Quadro M2000M "}, Params{ 0, 1, 16, 2, 16, 16, 128, 32, 16, 128, 1, 1, 1, 1, 1, 2 } }, 
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 32, 32, 64, 0, 0, 0, 0, 2, 1 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 2 } }, { Name{"GeForce GTX 980 "}, Params{ 0, 1, 16, 2, 16, 16, 64, 16, 8, 128, 1, 1, 1, 0, 4, 8 } }, { Name{"GeForce GTX TITAN X "}, Params{ 0, 1, 16, 2, 8, 16, 128, 8, 8, 128, 1, 1, 1, 1, 4, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 16, 32, 16, 8, 128, 1, 1, 1, 0, 1, 2 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 0, 1, 16, 2, 32, 16, 128, 32, 8, 128, 1, 1, 1, 0, 4, 1 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 1 } }, { Name{"GeForce GTX 1080 "}, Params{ 0, 1, 32, 2, 16, 8, 64, 8, 8, 64, 1, 1, 1, 1, 4, 8 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 0, 1, 16, 2, 32, 16, 64, 16, 8, 128, 1, 1, 0, 1, 2, 8 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 0, 1, 32, 2, 16, 16, 128, 32, 8, 128, 1, 1, 1, 1, 4, 4 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 0, 1, 32, 2, 16, 16, 128, 32, 8, 128, 1, 1, 1, 1, 4, 4 } }, { Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 1 } }, { Name{"Tesla P4 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 8, 64, 0, 1, 1, 0, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 0, 1, 16, 2, 32, 8, 128, 32, 16, 128, 1, 1, 1, 0, 4, 1 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 0, 1, 32, 2, 8, 8, 64, 32, 16, 128, 1, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 0, 1, 16, 2, 32, 8, 128, 32, 16, 128, 1, 1, 1, 0, 4, 1 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 32, 16, 128, 1, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 8, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 8 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 16, 128, 0, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 8, 1, 1, 16, 16, 64, 4, 4, 128, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 0, 1, 32, 2, 8, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 16, 1, 1, 16, 16, 64, 8, 8, 128, 0, 0, 0, 0, 2, 2 } }, { Name{"Quadro T2000 "}, Params{ 0, 1, 16, 2, 16, 16, 128, 32, 16, 128, 1, 1, 1, 1, 1, 2 } }, { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 2 } }, { Name{"Tesla T4 "}, Params{ 0, 1, 16, 2, 16, 16, 128, 32, 16, 128, 1, 1, 1, 1, 1, 2 } }, { kDeviceNameDefault , Params{ 1, 4, 1, 1, 4, 4, 64, 8, 8, 32, 0, 0, 0, 0, 4, 4 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti 
Laptop GPU "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 0, 1, 32, 2, 8, 8, 64, 32, 16, 128, 1, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 32, 16, 128, 1, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 0, 1, 16, 2, 8, 8, 32, 8, 16, 128, 1, 0, 1, 1, 4, 8 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 32, 16, 128, 1, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 0, 1, 16, 2, 8, 8, 32, 8, 16, 128, 1, 0, 1, 1, 4, 8 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 16, 128, 0, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 2, 2 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 0, 1, 32, 2, 32, 16, 128, 8, 8, 64, 0, 0, 0, 0, 4, 8 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 8, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 8 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 0, 1, 16, 2, 8, 8, 32, 8, 16, 128, 1, 0, 1, 1, 4, 8 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 0, 1, 16, 2, 8, 8, 32, 8, 16, 128, 1, 0, 1, 1, 4, 8 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 0, 1, 32, 2, 32, 32, 128, 8, 8, 64, 0, 1, 1, 1, 4, 4 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 8 } }, } }, { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 32, 1, 1, 0, 0, 2, 4 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 128, 0, 0, 1, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 128, 0, 0, 1, 0, 2, 4 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 128, 0, 0, 1, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 128, 0, 0, 1, 0, 2, 4 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm/xgemm_3232.hpp000066400000000000000000000642311463263031500224300ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. 
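//
// Added note (illustration, not part of the generated output): Params values are positional,
// in the order of the parameter-name list of each DatabaseEntry. For example, the row
//   Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 }
// reads as GEMMK=0, KREG=1, KWG=32, KWI=2, MDIMA=16, MDIMC=16, MWG=64, NDIMB=16, NDIMC=16,
// NWG=64, SA=1, SB=1, STRM=0, STRN=0, VWM=4, VWN=4.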
// // This file populates the database with best-found tuning parameters for the 'Xgemm3232' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmComplexSingle = { "Xgemm", Precision::kComplexSingle, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 1, 4 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 1, 16, 2, 16, 16, 32, 32, 16, 128, 1, 1, 1, 1, 1, 4 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 1, 2 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 64, 0, 0, 1, 1, 2, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 64, 1, 1, 0, 0, 2, 1 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 1 } }, { Name{"AMD Radeon R9 290X "}, Params{ 0, 1, 32, 2, 32, 8, 32, 8, 16, 32, 1, 0, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 8, 32, 8, 16, 32, 1, 0, 0, 0, 1, 1 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 0, 1, 32, 2, 16, 8, 32, 32, 32, 128, 1, 0, 0, 1, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 8, 32, 32, 32, 128, 1, 0, 0, 1, 2, 4 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 0, 1, 16, 2, 8, 8, 32, 8, 8, 32, 0, 1, 1, 1, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 32, 8, 8, 32, 0, 1, 1, 1, 4, 2 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 0, 1, 16, 2, 8, 8, 32, 8, 16, 32, 1, 0, 0, 1, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 32, 8, 16, 32, 1, 0, 0, 1, 2, 1 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 0, 1, 16, 2, 32, 8, 64, 16, 32, 64, 1, 1, 1, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 32, 8, 64, 16, 32, 64, 1, 1, 1, 0, 2, 1 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 8, 32, 0, 1, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 32, 32, 8, 32, 0, 1, 0, 0, 2, 1 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 0, 0, 0, 0, 4, 4 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 0, 0, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 64, 1, 1, 0, 0, 1, 2 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 0, 1, 32, 2, 16, 8, 32, 16, 16, 64, 1, 1, 1, 0, 2, 2 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 0, 1, 32, 2, 8, 16, 32, 8, 8, 128, 0, 0, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 0, 0, 0, 0, 4, 4 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 2, 1, 1, 16, 16, 32, 16, 
16, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 1, 32, 2, 16, 8, 32, 16, 16, 64, 1, 1, 1, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 1, 4, 1, 1, 8, 8, 32, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 0, 1, 32, 2, 16, 8, 32, 32, 32, 64, 1, 0, 1, 1, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 8, 32, 32, 32, 64, 1, 0, 1, 1, 1, 2 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 0, 0, 0, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 0, 0, 0, 0, 2, 4 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 0, 1, 16, 2, 8, 16, 64, 32, 8, 64, 1, 1, 1, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 16, 64, 32, 8, 64, 1, 1, 1, 0, 1, 2 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 0, 1, 16, 2, 8, 16, 64, 32, 8, 64, 1, 1, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 16, 64, 32, 8, 64, 1, 1, 0, 0, 2, 1 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 16, 64, 1, 0, 1, 1, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 16, 64, 1, 0, 1, 1, 4, 4 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 0, 1, 16, 2, 8, 8, 16, 32, 16, 64, 1, 0, 1, 1, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 16, 32, 16, 64, 1, 0, 1, 1, 1, 1 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 1, 16, 2, 32, 32, 64, 16, 8, 128, 0, 0, 1, 1, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 32, 32, 64, 16, 8, 128, 0, 0, 1, 1, 2, 2 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 1, 16, 2, 8, 8, 16, 32, 16, 64, 1, 0, 1, 1, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 16, 32, 16, 64, 1, 0, 1, 1, 1, 1 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 16, 2, 16, 16, 32, 32, 16, 128, 1, 1, 1, 1, 1, 4 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 64, 1, 1, 0, 0, 2, 4 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 1, 1 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 1 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 32, 1, 1, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 32, 1, 1, 0, 0, 1, 2 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 0, 1, 32, 2, 32, 16, 64, 8, 8, 32, 0, 1, 1, 0, 2, 4 } }, { Name{"Apple M2 Max "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 16, 32, 0, 1, 0, 1, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 16, 32, 0, 1, 0, 1, 1, 2 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { 
"default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 1, 16, 2, 32, 8, 128, 16, 16, 128, 1, 1, 0, 1, 1, 2 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 1, 8, 1, 1, 16, 16, 16, 2, 2, 128, 0, 0, 0, 0, 1, 4 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 1, 32, 2, 32, 32, 32, 16, 16, 128, 1, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1, 16, 1, 1, 4, 4, 128, 2, 2, 64, 0, 0, 0, 0, 4, 8 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 0, 1, 32, 2, 32, 16, 32, 16, 16, 64, 0, 1, 1, 0, 1, 2 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 16, 64, 0, 1, 0, 0, 4, 4 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 1, 32, 2, 8, 8, 128, 16, 32, 128, 0, 0, 0, 0, 1, 4 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 1, 32, 2, 8, 8, 128, 32, 8, 128, 0, 0, 0, 0, 1, 4 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 0, 1, 32, 2, 8, 16, 16, 16, 16, 128, 0, 0, 1, 1, 1, 4 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 16, 128, 0, 1, 0, 0, 1, 8 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 0, 1, 32, 2, 32, 32, 32, 16, 16, 64, 1, 1, 0, 0, 1, 4 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 1, 2, 1, 1, 16, 16, 16, 8, 8, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 32, 1, 1, 0, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 32, 8, 8, 64, 1, 1, 0, 0, 1, 4 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 4, 1, 1, 16, 16, 64, 8, 8, 32, 0, 0, 0, 0, 2, 2 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 4, 1, 1, 4, 4, 16, 8, 8, 32, 0, 0, 0, 0, 2, 2 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 0, 1, 16, 8, 8, 16, 64, 32, 8, 32, 0, 0, 0, 0, 2, 1 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 1, 16, 8, 8, 8, 32, 16, 16, 64, 1, 0, 0, 0, 4, 4 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 32, 8, 16, 16, 64, 16, 16, 64, 1, 1, 1, 1, 2, 1 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 4, 1, 1, 32, 32, 128, 16, 16, 128, 0, 0, 0, 0, 4, 1 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 2, 1, 1, 4, 4, 16, 8, 8, 64, 0, 0, 0, 0, 2, 2 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 1, 2, 1, 1, 4, 4, 16, 4, 4, 32, 0, 0, 0, 0, 2, 2 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 2, 1, 1, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 2, 2 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 1, 2, 1, 1, 4, 4, 16, 8, 8, 64, 0, 0, 0, 0, 2, 2 } }, { Name{"Iris "}, Params{ 0, 1, 32, 8, 32, 16, 64, 8, 16, 64, 1, 0, 1, 0, 1, 1 } }, { Name{"Iris Pro "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 8, 32, 1, 1, 1, 1, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1, 4, 1, 1, 16, 16, 16, 16, 16, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, 
Params{ 0, 1, 32, 2, 32, 32, 32, 32, 16, 128, 1, 0, 0, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 1, 1, 16, 16, 16, 16, 16, 64, 0, 0, 0, 0, 1, 1 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 0, 1, 16, 2, 16, 16, 32, 32, 16, 128, 0, 1, 1, 1, 2, 2 } }, { Name{"GeForce GTX 580 "}, Params{ 0, 1, 32, 2, 16, 8, 32, 32, 32, 128, 1, 0, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 8, 32, 32, 16, 128, 0, 0, 1, 0, 1, 1 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 0, 1, 16, 8, 32, 32, 64, 32, 16, 128, 1, 0, 1, 0, 1, 4 } }, { Name{"GeForce GTX 670 "}, Params{ 0, 1, 16, 2, 32, 32, 64, 32, 8, 32, 1, 1, 1, 1, 1, 1 } }, { Name{"GeForce GTX 680 "}, Params{ 0, 1, 16, 2, 32, 16, 64, 32, 32, 128, 1, 0, 0, 0, 2, 2 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 0, 1, 16, 2, 32, 16, 64, 32, 8, 32, 0, 1, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 16, 0, 0, 0, 0, 4, 1 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 0, 1, 32, 2, 8, 32, 128, 32, 8, 64, 0, 1, 0, 0, 4, 1 } }, { Name{"GeForce GTX TITAN "}, Params{ 0, 1, 16, 8, 16, 16, 64, 32, 16, 64, 1, 1, 1, 0, 1, 1 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 0, 1, 16, 2, 8, 16, 64, 8, 8, 32, 0, 1, 1, 0, 1, 2 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 0, 1, 16, 2, 16, 16, 64, 32, 8, 64, 0, 1, 1, 1, 4, 1 } }, { Name{"Tesla K20m "}, Params{ 0, 1, 32, 2, 8, 16, 64, 8, 16, 64, 1, 0, 0, 0, 1, 4 } }, { Name{"Tesla K40m "}, Params{ 0, 1, 16, 2, 32, 32, 32, 32, 8, 64, 0, 1, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 0, 0, 0, 0, 1, 1 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 0, 1, 32, 2, 16, 8, 64, 8, 32, 128, 0, 0, 1, 0, 2, 2 } }, { Name{"GeForce GTX 750 "}, Params{ 0, 1, 16, 8, 16, 16, 64, 16, 16, 64, 1, 1, 1, 0, 2, 2 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 0, 1, 16, 2, 16, 8, 32, 32, 16, 64, 1, 1, 1, 0, 1, 2 } }, { Name{"Quadro M2000M "}, Params{ 0, 1, 16, 2, 16, 8, 128, 16, 32, 64, 1, 1, 1, 1, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 64, 1, 1, 0, 0, 2, 1 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 0, 1, 16, 2, 8, 16, 64, 8, 8, 64, 0, 0, 1, 0, 1, 4 } }, { Name{"GeForce GTX 980 "}, Params{ 0, 1, 32, 8, 32, 32, 64, 16, 16, 64, 1, 1, 1, 0, 2, 1 } }, { Name{"GeForce GTX TITAN X "}, Params{ 0, 1, 16, 2, 8, 8, 64, 8, 8, 32, 1, 0, 1, 1, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 64, 8, 8, 32, 0, 0, 1, 0, 1, 1 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 0, 1, 16, 2, 16, 16, 64, 32, 8, 64, 0, 1, 1, 1, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 16, 64, 32, 8, 64, 0, 1, 1, 1, 4, 1 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 0, 1, 16, 2, 16, 16, 128, 16, 16, 64, 1, 1, 1, 1, 2, 4 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 0, 1, 16, 2, 16, 8, 128, 16, 32, 64, 1, 1, 1, 1, 1, 2 } }, { Name{"GeForce GTX 1080 "}, Params{ 0, 1, 16, 2, 32, 16, 64, 32, 8, 64, 1, 1, 0, 0, 1, 2 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 0, 1, 16, 2, 8, 16, 32, 16, 8, 64, 1, 1, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 0, 1, 32, 2, 32, 32, 64, 32, 8, 64, 1, 1, 0, 1, 2, 1 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 0, 1, 16, 2, 32, 16, 64, 8, 8, 64, 1, 1, 1, 0, 1, 1 } }, { Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 4 } }, { Name{"Tesla P4 "}, Params{ 0, 1, 32, 2, 32, 32, 64, 16, 16, 64, 1, 1, 0, 0, 1, 2 } }, { kDeviceNameDefault 
, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 64, 1, 1, 0, 0, 2, 4 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 0, 0, 0, 0, 4, 4 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 0, 0, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 64, 0, 0, 0, 0, 4, 4 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 0, 1, 16, 2, 8, 8, 16, 16, 16, 64, 1, 0, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 0, 1, 16, 2, 32, 8, 32, 16, 32, 128, 1, 0, 0, 0, 1, 4 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 2, 4 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 0, 1, 16, 2, 16, 8, 64, 8, 16, 64, 1, 1, 1, 1, 1, 4 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 16, 128, 0, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 0, 1, 32, 2, 8, 16, 64, 32, 8, 32, 0, 0, 1, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 16, 1, 1, 16, 16, 64, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 4, 1, 1, 32, 32, 128, 8, 8, 64, 0, 0, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 4, 1, 1, 8, 8, 32, 16, 16, 128, 0, 0, 0, 0, 4, 2 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 0, 1, 16, 2, 8, 8, 128, 16, 16, 64, 1, 0, 1, 0, 1, 1 } }, { Name{"Quadro T2000 "}, Params{ 0, 1, 32, 2, 16, 8, 64, 8, 32, 128, 0, 0, 1, 0, 2, 2 } }, { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 64, 1, 0, 0, 1, 2, 2 } }, { Name{"Tesla T4 "}, Params{ 0, 1, 32, 2, 16, 8, 64, 8, 32, 128, 0, 0, 1, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 1, 1 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 0, 1, 16, 2, 16, 8, 64, 8, 16, 64, 1, 1, 1, 1, 1, 4 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 128, 0, 1, 1, 1, 4, 1 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 0, 1, 16, 2, 16, 8, 64, 8, 16, 64, 1, 1, 1, 1, 1, 4 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 128, 0, 1, 1, 1, 4, 1 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1, 8, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 8 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1, 16, 1, 1, 16, 16, 64, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 0, 1, 32, 2, 32, 8, 128, 16, 16, 64, 1, 0, 1, 0, 2, 1 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1, 4, 1, 1, 8, 8, 32, 16, 16, 128, 0, 0, 0, 0, 4, 2 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1, 4, 1, 1, 4, 4, 32, 8, 8, 64, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 1, 2 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 0, 1, 16, 2, 16, 8, 64, 8, 16, 64, 1, 1, 1, 1, 1, 4 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 16, 128, 0, 1, 1, 1, 2, 2 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 0, 1, 16, 2, 8, 8, 128, 16, 16, 64, 1, 0, 1, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 16, 1, 1, 16, 16, 64, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 0, 1, 16, 
2, 16, 8, 64, 8, 16, 64, 1, 1, 1, 1, 1, 4 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 16, 1, 1, 16, 16, 64, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 0, 1, 16, 2, 16, 8, 64, 8, 16, 64, 1, 1, 1, 1, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 1, 2 } }, } }, { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 1 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 0, 0, 0, 0, 1, 1 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 64, 0, 0, 0, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 64, 0, 0, 0, 0, 1, 4 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 32, 0, 0, 0, 0, 1, 1 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm/xgemm_64.hpp000066400000000000000000000542231463263031500222700ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmDouble = { "Xgemm", Precision::kDouble, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 64, 1, 1, 0, 0, 2, 2 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 1, 16, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 1, 2, 4 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 2 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 32, 0, 0, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 0, 1, 32, 2, 16, 32, 64, 32, 8, 64, 1, 1, 1, 1, 2, 2 } }, { Name{"AMD Radeon R9 290X "}, Params{ 0, 1, 16, 8, 32, 8, 128, 8, 8, 32, 0, 1, 0, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 8, 64, 8, 8, 32, 0, 1, 0, 0, 1, 2 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 0, 1, 16, 2, 8, 16, 64, 16, 8, 16, 0, 0, 1, 1, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 16, 64, 16, 8, 16, 0, 0, 1, 1, 1, 1 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 0, 1, 32, 2, 32, 16, 64, 8, 16, 32, 0, 0, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 16, 64, 8, 16, 32, 0, 0, 0, 0, 1, 2 } }, } }, { 
"Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 0, 1, 32, 2, 16, 8, 16, 8, 8, 32, 0, 0, 0, 1, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 8, 16, 8, 8, 32, 0, 0, 0, 1, 1, 4 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 32, 0, 0, 0, 0, 2, 2 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 64, 1, 1, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 32, 1, 1, 0, 0, 4, 2 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 32, 1, 1, 0, 0, 4, 2 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 64, 1, 1, 0, 0, 2, 4 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 32, 1, 1, 0, 0, 4, 1 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 0, 1, 32, 2, 32, 32, 32, 8, 8, 32, 1, 1, 0, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 32, 8, 8, 32, 1, 1, 0, 0, 1, 4 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 0, 1, 16, 2, 8, 8, 64, 8, 16, 128, 1, 1, 0, 1, 8, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 64, 8, 16, 128, 1, 1, 0, 1, 8, 2 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 0, 1, 32, 2, 16, 8, 32, 8, 8, 16, 0, 1, 0, 1, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 8, 32, 8, 8, 16, 0, 1, 0, 1, 1, 2 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 16, 1, 1, 0, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 16, 1, 1, 0, 0, 2, 2 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 1, 1, 0, 0, 4, 2 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 1 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 16, 2, 16, 16, 32, 32, 16, 128, 1, 1, 1, 1, 1, 4 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } }, } 
}, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 16, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 1, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 1, 2, 4 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 4 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 1, 16, 2, 32, 8, 128, 16, 16, 128, 1, 1, 1, 1, 2, 8 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 1, 32, 2, 16, 8, 128, 16, 8, 128, 1, 0, 1, 1, 1, 8 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1, 8, 1, 1, 2, 2, 128, 2, 2, 64, 0, 0, 0, 0, 2, 1 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 0, 1, 32, 2, 32, 16, 128, 16, 16, 64, 0, 1, 1, 0, 1, 2 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 1, 32, 2, 32, 16, 128, 16, 16, 128, 0, 0, 1, 0, 1, 2 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 1, 32, 2, 16, 8, 128, 8, 8, 64, 1, 0, 0, 1, 2, 8 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 1, 32, 2, 16, 8, 128, 8, 8, 128, 1, 0, 0, 0, 2, 8 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 0, 1, 32, 2, 8, 16, 128, 16, 8, 128, 0, 0, 1, 1, 1, 8 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 0, 1, 32, 2, 8, 16, 64, 16, 8, 64, 0, 1, 1, 0, 1, 4 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 2, 1, 1, 8, 8, 32, 4, 4, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 1, 2, 1, 1, 16, 16, 16, 8, 8, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 2, 1, 1, 16, 16, 16, 4, 4, 16, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 64, 1, 1, 0, 0, 1, 4 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 64, 0, 0, 0, 0, 1, 4 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 0, 1, 32, 8, 8, 16, 16, 16, 16, 128, 0, 0, 1, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 8, 8, 16, 16, 16, 16, 128, 0, 0, 1, 0, 1, 4 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 0, 1, 16, 2, 8, 16, 32, 32, 8, 64, 1, 1, 1, 0, 1, 2 } }, { Name{"GeForce GTX 580 "}, Params{ 0, 1, 32, 2, 32, 16, 64, 8, 8, 32, 0, 1, 1, 1, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 16, 32, 8, 8, 32, 0, 1, 1, 0, 1, 2 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 0, 1, 16, 2, 8, 8, 16, 8, 8, 32, 1, 0, 0, 1, 2, 2 } }, { Name{"GeForce GTX 670 "}, Params{ 0, 1, 32, 8, 16, 32, 128, 16, 8, 32, 0, 1, 1, 0, 1, 1 } }, { Name{"GeForce GTX 680 "}, Params{ 0, 1, 32, 8, 8, 8, 32, 16, 32, 128, 1, 0, 0, 1, 2, 4 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 0, 1, 16, 2, 8, 16, 32, 16, 8, 32, 1, 0, 0, 0, 2, 1 } }, { kDeviceNameDefault , 
Params{ 0, 1, 32, 2, 8, 8, 16, 32, 32, 32, 0, 0, 0, 0, 2, 1 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 0, 1, 32, 2, 8, 8, 64, 16, 16, 64, 0, 0, 0, 0, 4, 2 } }, { Name{"GeForce GTX TITAN "}, Params{ 0, 1, 16, 8, 16, 8, 32, 16, 32, 128, 1, 1, 1, 1, 2, 2 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 0, 1, 16, 2, 16, 8, 16, 16, 8, 16, 1, 1, 1, 0, 1, 1 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 2, 2 } }, { Name{"Tesla K20m "}, Params{ 0, 1, 16, 2, 32, 8, 32, 16, 16, 64, 1, 0, 0, 0, 1, 1 } }, { Name{"Tesla K40m "}, Params{ 0, 1, 32, 2, 16, 8, 64, 16, 32, 128, 1, 0, 1, 1, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 2, 2 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 0, 0, 0, 0, 1, 2 } }, { Name{"GeForce GTX 750 "}, Params{ 0, 1, 32, 8, 16, 32, 64, 16, 8, 128, 0, 0, 0, 1, 2, 1 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 32, 0, 0, 0, 0, 4, 2 } }, { Name{"Quadro M2000M "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 0, 0, 0, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 0, 0, 0, 0, 2, 2 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 0, 0, 0, 0, 2, 1 } }, { Name{"GeForce GTX 980 "}, Params{ 0, 1, 32, 8, 16, 8, 64, 32, 32, 128, 0, 0, 1, 0, 2, 4 } }, { Name{"GeForce GTX TITAN X "}, Params{ 0, 1, 16, 8, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 0, 1, 16, 2, 16, 16, 64, 32, 8, 64, 0, 1, 1, 1, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 16, 64, 32, 8, 64, 0, 1, 1, 1, 4, 1 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 0, 1, 16, 2, 8, 16, 32, 8, 8, 64, 0, 0, 1, 1, 2, 8 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 32, 0, 0, 0, 0, 1, 2 } }, { Name{"GeForce GTX 1080 "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 4 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 0, 1, 16, 2, 16, 16, 16, 16, 16, 64, 0, 0, 1, 0, 1, 4 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 16, 1, 1, 8, 8, 16, 4, 4, 16, 0, 0, 0, 0, 2, 4 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 32, 32, 32, 16, 16, 32, 0, 0, 0, 0, 1, 2 } }, { Name{"Tesla P4 "}, Params{ 1, 2, 1, 1, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 32, 32, 64, 0, 0, 0, 0, 2, 2 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 2, 4 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 2, 4 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 8, 1, 1, 4, 4, 32, 32, 32, 128, 0, 0, 0, 0, 4, 4 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 32, 1, 1, 0, 0, 1, 4 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 2, 1, 1, 2, 2, 16, 16, 16, 64, 0, 0, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 2, 2 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 8, 1, 1, 32, 32, 32, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 16, 1, 1, 
4, 4, 32, 8, 8, 16, 0, 0, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 16, 1, 1, 4, 4, 32, 8, 8, 16, 0, 0, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 8, 1, 1, 32, 32, 32, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 1, 1, 1, 4, 4, 8, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"Quadro T2000 "}, Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 64, 0, 0, 1, 1, 4, 4 } }, { Name{"TITAN RTX "}, Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 16, 0, 0, 0, 0, 2, 1 } }, { Name{"Tesla T4 "}, Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 64, 0, 0, 1, 1, 4, 4 } }, { kDeviceNameDefault , Params{ 1, 1, 1, 1, 8, 8, 32, 32, 32, 32, 0, 0, 0, 0, 1, 1 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 2 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 0, 1, 32, 2, 32, 32, 64, 16, 16, 64, 0, 1, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 8, 1, 1, 32, 32, 32, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 0, 1, 16, 2, 16, 16, 16, 8, 8, 64, 1, 0, 0, 0, 1, 4 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1, 16, 1, 1, 16, 16, 32, 8, 8, 8, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 0, 1, 16, 2, 16, 16, 16, 8, 8, 64, 1, 0, 0, 0, 1, 4 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 16, 1, 1, 8, 8, 16, 4, 4, 16, 0, 0, 0, 0, 2, 4 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 1 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1, 8, 1, 1, 8, 8, 16, 32, 32, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 8, 1, 1, 8, 8, 16, 32, 32, 64, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 16, 1, 1, 16, 16, 32, 8, 8, 8, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 16, 1, 1, 16, 16, 32, 8, 8, 8, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1, 16, 1, 1, 8, 8, 16, 4, 4, 16, 0, 0, 0, 0, 2, 4 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 16, 1, 1, 8, 8, 16, 4, 4, 16, 0, 0, 0, 0, 2, 4 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 4, 1, 1, 32, 32, 128, 16, 16, 64, 0, 0, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 1, 32, 32, 32, 4, 4, 32, 0, 0, 0, 0, 1, 8 } }, } }, { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 2, 2 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 32, 1, 1, 0, 0, 1, 4 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm/xgemm_6464.hpp000066400000000000000000000533511463263031500224430ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmComplexDouble = { "Xgemm", Precision::kComplexDouble, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 1 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 0, 1, 32, 8, 8, 16, 32, 16, 16, 32, 0, 0, 1, 1, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 0, 0, 0, 0, 2, 4 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 1 } }, { Name{"AMD Radeon R9 290X "}, Params{ 0, 1, 16, 2, 16, 16, 16, 16, 16, 32, 1, 0, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 16, 1, 1, 0, 0, 2, 1 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 0, 1, 16, 2, 16, 8, 16, 16, 32, 128, 0, 0, 0, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 8, 16, 16, 32, 128, 0, 0, 0, 0, 1, 4 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 0, 1, 32, 2, 16, 8, 32, 8, 32, 32, 0, 1, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 8, 32, 8, 32, 32, 0, 1, 1, 0, 1, 1 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 0, 1, 16, 2, 16, 8, 16, 8, 8, 16, 0, 0, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 8, 16, 8, 8, 16, 0, 0, 1, 0, 1, 1 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 0, 1, 16, 2, 32, 16, 32, 16, 16, 16, 1, 1, 1, 1, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 32, 16, 32, 16, 16, 16, 1, 1, 1, 1, 1, 1 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 2, 1, 1, 16, 16, 32, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 0, 0, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 32, 8, 8, 32, 0, 0, 0, 0, 1, 2 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 1 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 16, 1, 1, 0, 1, 2, 2 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 1, 1, 0, 0, 1, 2 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 32, 1, 1, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 
1, 32, 2, 16, 16, 16, 8, 8, 32, 1, 1, 0, 0, 1, 2 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 0, 1, 16, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 1, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 1, 4, 4 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 1, 2 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 2, 1, 1, 4, 4, 32, 8, 8, 32, 0, 0, 0, 0, 8, 2 } }, { kDeviceNameDefault , Params{ 1, 2, 1, 1, 4, 4, 32, 8, 8, 32, 0, 0, 0, 0, 8, 2 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 16, 1, 1, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 16, 1, 1, 0, 0, 2, 1 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 16, 1, 1, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 16, 16, 16, 1, 1, 0, 0, 2, 1 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 0, 1, 16, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 1, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 1, 4, 1 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 1, 4 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 32, 128, 0, 0, 0, 1, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 32, 128, 0, 0, 0, 1, 1, 2 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 0, 1, 32, 2, 32, 16, 64, 8, 16, 32, 1, 0, 1, 1, 2, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 8, 16, 8, 16, 64, 1, 0, 1, 0, 1, 1 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 0, 1, 16, 2, 32, 8, 64, 16, 8, 128, 0, 1, 0, 1, 2, 1 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 0, 1, 32, 2, 8, 8, 32, 16, 32, 128, 1, 0, 1, 0, 4, 1 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1, 16, 1, 1, 4, 4, 128, 2, 2, 64, 0, 0, 0, 0, 4, 8 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 0, 1, 32, 2, 16, 32, 128, 16, 16, 64, 0, 1, 0, 0, 2, 4 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 0, 1, 32, 2, 16, 32, 128, 16, 8, 32, 0, 1, 0, 0, 4, 1 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 0, 1, 32, 2, 8, 8, 128, 8, 16, 128, 0, 0, 0, 1, 1, 8 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 0, 1, 32, 2, 8, 8, 128, 32, 8, 128, 0, 0, 0, 0, 1, 4 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 0, 1, 32, 8, 8, 32, 32, 8, 8, 32, 0, 1, 0, 0, 1, 2 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 
0, 1, 32, 2, 32, 8, 128, 16, 8, 128, 0, 0, 1, 1, 1, 4 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 2, 1, 1, 8, 8, 16, 4, 4, 64, 0, 0, 0, 0, 2, 2 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 0, 1, 16, 2, 16, 32, 64, 32, 8, 64, 0, 1, 0, 0, 2, 2 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 0, 1, 32, 2, 32, 32, 32, 8, 8, 32, 1, 1, 0, 0, 1, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 2 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 32, 0, 0, 0, 0, 2, 1 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 1, 1, 1, 8, 8, 32, 8, 8, 32, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 32, 0, 0, 0, 0, 1, 4 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 8, 32, 0, 0, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 16, 16, 8, 32, 0, 0, 1, 0, 1, 1 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 0, 1, 16, 2, 32, 32, 32, 32, 8, 32, 0, 0, 1, 0, 1, 1 } }, { Name{"GeForce GTX 580 "}, Params{ 0, 1, 32, 2, 32, 32, 32, 8, 8, 64, 0, 0, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 32, 32, 32, 8, 8, 32, 0, 0, 0, 0, 1, 1 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 0, 1, 32, 8, 16, 16, 16, 8, 16, 64, 1, 0, 1, 1, 1, 1 } }, { Name{"GeForce GTX 670 "}, Params{ 0, 1, 32, 8, 16, 8, 16, 16, 32, 64, 1, 0, 0, 1, 1, 2 } }, { Name{"GeForce GTX 680 "}, Params{ 0, 1, 16, 8, 16, 8, 64, 16, 32, 32, 0, 1, 1, 0, 1, 1 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 16, 32, 16, 8, 128, 0, 1, 1, 0, 1, 2 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 32, 0, 0, 0, 0, 4, 1 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 0, 1, 16, 2, 16, 16, 32, 16, 8, 32, 0, 1, 1, 1, 1, 1 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 2, 4 } }, { Name{"Tesla K20m "}, Params{ 0, 1, 32, 2, 32, 8, 32, 16, 16, 64, 0, 0, 1, 0, 1, 1 } }, { Name{"Tesla K40m "}, Params{ 0, 1, 16, 8, 8, 8, 32, 32, 16, 32, 0, 0, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 32, 0, 0, 0, 0, 1, 1 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 0, 1, 32, 2, 8, 8, 64, 8, 8, 32, 0, 0, 0, 0, 1, 2 } }, { Name{"GeForce GTX 750 "}, Params{ 0, 1, 32, 2, 8, 32, 32, 8, 8, 64, 0, 0, 1, 0, 1, 4 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 32, 0, 0, 0, 0, 1, 1 } }, { Name{"Quadro M2000M "}, Params{ 1, 1, 1, 1, 8, 8, 32, 32, 32, 32, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 0, 0, 0, 0, 1, 1 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 32, 0, 0, 0, 0, 1, 1 } }, { Name{"GeForce GTX 980 "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 16, 128, 0, 0, 1, 1, 2, 2 } }, { Name{"GeForce GTX TITAN X "}, Params{ 0, 1, 32, 8, 16, 16, 128, 16, 16, 32, 0, 0, 1, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 16, 8, 16, 32, 0, 0, 0, 0, 1, 1 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 0, 1, 16, 2, 8, 8, 16, 32, 16, 32, 1, 1, 1, 1, 2, 1 } }, { kDeviceNameDefault , 
Params{ 0, 1, 16, 2, 8, 8, 16, 32, 16, 32, 1, 1, 1, 1, 2, 1 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 0, 1, 32, 8, 32, 16, 32, 8, 8, 32, 0, 0, 0, 1, 1, 4 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 0, 0, 0, 0, 1, 1 } }, { Name{"GeForce GTX 1080 "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 0, 0, 0, 0, 1, 2 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 2, 1, 1, 8, 8, 32, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"Tesla P4 "}, Params{ 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 32, 32, 32, 64, 0, 0, 0, 0, 1, 2 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 0, 1, 16, 2, 8, 8, 16, 8, 16, 32, 0, 1, 0, 1, 1, 2 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 1, 1, 1, 16, 16, 32, 8, 8, 32, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 16, 0, 0, 0, 0, 2, 1 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 32, 0, 0, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 8, 1, 1, 2, 2, 16, 32, 32, 128, 0, 0, 0, 0, 4, 4 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 1, 16, 1, 1, 8, 8, 16, 4, 4, 16, 0, 0, 0, 0, 2, 4 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 0, 0, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 16, 1, 1, 16, 16, 32, 8, 8, 8, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 1 } }, { Name{"Quadro T2000 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 2 } }, { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { Name{"Tesla T4 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 64, 0, 0, 0, 0, 1, 1 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 0, 1, 16, 2, 8, 8, 16, 32, 16, 32, 1, 1, 1, 1, 2, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 16, 2, 8, 8, 16, 32, 16, 32, 1, 1, 1, 1, 2, 1 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 1, 1, 0, 0, 2, 1 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 16, 1, 1, 16, 16, 32, 8, 8, 8, 0, 0, 0, 0, 1, 1 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 2, 1 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 
      0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 2, 1 } },
        { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 1, 1, 0, 0, 1, 1 } },
        { Name{"NVIDIA RTX A6000 "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 1, 1, 0, 0, 1, 1 } },
        { kDeviceNameDefault , Params{ 1, 1, 1, 1, 8, 8, 32, 32, 32, 32, 0, 0, 0, 0, 1, 1 } },
      } },
      { "SM8.9", {
        { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 1, 1, 0, 0, 1, 1 } },
        { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 16, 1, 1, 16, 16, 32, 8, 8, 8, 0, 0, 0, 0, 1, 1 } },
        { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 2, 1, 1, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
        { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 32, 0, 0, 0, 0, 1, 1 } },
        { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
        { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 16, 1, 1, 0, 0, 2, 2 } },
        { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 1, 1, 1, 16, 16, 128, 16, 16, 16, 0, 0, 0, 0, 4, 1 } },
        { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
      } },
      { "default", {
        { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 8, 8, 16, 0, 0, 0, 0, 2, 1 } },
      } },
    } },
    { // Default
      kDeviceTypeAll, "default", {
        { "default", {
          { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 32, 8, 8, 32, 0, 0, 0, 0, 1, 2 } },
        } },
      } },
  } };

} // namespace database
} // namespace clblast

CLBlast-1.6.3/src/database/kernels/xgemm_direct/
CLBlast-1.6.3/src/database/kernels/xgemm_direct/xgemm_direct.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
//
// =================================================================================================

#include "database/kernels/xgemm_direct/xgemm_direct.hpp"
#include "database/kernels/xgemm_direct/xgemm_direct_16.hpp"
#include "database/kernels/xgemm_direct/xgemm_direct_32.hpp"
#include "database/kernels/xgemm_direct/xgemm_direct_3232.hpp"
#include "database/kernels/xgemm_direct/xgemm_direct_64.hpp"
#include "database/kernels/xgemm_direct/xgemm_direct_6464.hpp"

CLBlast-1.6.3/src/database/kernels/xgemm_direct/xgemm_direct.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
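//
// A note on reading the tables in the per-precision files included alongside this header: every
// Params row is a fixed-width array of sixteen values, while this kernel family names only ten
// tunable parameters (KWID through WGD), so the last six slots of each row are zero padding. Each
// table also carries fallback rows: a kDeviceNameDefault entry per architecture, a "default"
// architecture per vendor, and a final kDeviceTypeAll "default" entry as the last resort. Below is
// a hand-written sketch (not auto-generated and not CLBlast API; all identifiers are illustrative)
// that pairs the half-precision kDeviceTypeAll fallback row with the parameter-name list:
//
//   #include <cstdio>
//
//   int main() {
//     const char* names[10] = {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD",
//                              "PADA", "PADB", "VWMD", "VWND", "WGD"};
//     const int row[16] = {2, 8, 8, 8, 8, 1, 1, 2, 2, 16,
//                          0, 0, 0, 0, 0, 0};  // trailing zeros are unused padding
//     for (int i = 0; i < 10; ++i) {
//       std::printf("%s = %d\n", names[i], row[i]);  // e.g. prints "WGD = 16"
//     }
//     return 0;
//   }
//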
// // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XgemmDirectHalf; extern const DatabaseEntry XgemmDirectSingle; extern const DatabaseEntry XgemmDirectComplexSingle; extern const DatabaseEntry XgemmDirectDouble; extern const DatabaseEntry XgemmDirectComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm_direct/xgemm_direct_16.hpp000066400000000000000000000230761463263031500251530ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm_Direct16' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmDirectHalf = { "XgemmDirect", Precision::kHalf, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 32, 8, 8, 32, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 8, 8, 16, 8, 1, 0, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 8, 8, 16, 8, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 16, 8, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 4, 64, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 2, 4, 64, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 16, 32, 16, 8, 1, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 32, 16, 8, 1, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 2, 8, 8, 16, 16, 1, 
1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 2, 16, 8, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 16, 16, 32, 8, 8, 0, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T628 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 8, 32, 32, 16, 8, 1, 0, 1, 4, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 2, 32, 32, 16, 16, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 16, 32, 16, 16, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 8, 16, 16, 8, 8, 0, 0, 4, 2, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 8, 16, 8, 8, 1, 1, 4, 4, 64, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } 
}, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 8, 16, 8, 8, 0, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 16, 8, 8, 0, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp000066400000000000000000000560531463263031500251520ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm_Direct32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmDirectSingle = { "XgemmDirect", Precision::kSingle, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 2, 8, 8, 32, 32, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 2, 8, 8, 32, 32, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 32, 32, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 2, 8, 8, 32, 32, 1, 1, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 32, 32, 1, 1, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 16, 16, 16, 32, 8, 0, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 16, 32, 8, 0, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 8, 16, 8, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 16, 8, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } 
}, { Name{"AMD Radeon RX 6900 XT "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 8, 16, 16, 32, 16, 1, 1, 4, 1, 64, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 32, 16, 1, 1, 4, 1, 64, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 8, 8, 8, 8, 1, 1, 8, 4, 64, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 8, 8, 8, 1, 1, 8, 4, 64, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 2, 16, 8, 8, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 8, 8, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 16, 8, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 2, 16, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 16, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 16, 8, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 16, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { 
Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 1, 8, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 16, 16, 16, 0, 0, 1, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 8, 8, 8, 8, 8, 0, 0, 8, 4, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 2, 32, 32, 32, 32, 0, 0, 1, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 16, 16, 8, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 2, 2, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 8, 8, 16, 8, 0, 0, 4, 4, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 32, 32, 32, 16, 0, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 16, 8, 16, 16, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 16, 8, 16, 16, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 8, 32, 32, 16, 8, 1, 0, 1, 4, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 2, 32, 32, 16, 16, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 16, 32, 16, 8, 1, 0, 1, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 8, 16, 16, 8, 8, 0, 0, 4, 2, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 32, 16, 8, 8, 1, 0, 2, 8, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 2, 8, 32, 8, 8, 0, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 32, 8, 8, 0, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 580 "}, Params{ 2, 16, 8, 32, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 8, 32, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GeForce GT 650M "}, Params{ 16, 16, 16, 8, 16, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 32, 8, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 16, 32, 8, 32, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 2, 8, 8, 
16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 16, 8, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 2, 16, 8, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 2, 8, 8, 32, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 32, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 8, 8, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 8, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 2, 16, 8, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 16, 16, 8, 16, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 16, 8, 8, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 32, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 32, 8, 8, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 16, 8, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 2, 16, 8, 8, 16, 1, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 16, 8, 8, 16, 1, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 2, 16, 8, 8, 16, 1, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 2, 16, 8, 8, 16, 1, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 16, 8, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 16, 8, 8, 16, 1, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 16, 8, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 16, 8, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 16, 8, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 8, 8, 16, 1, 1, 2, 1, 64, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 8, 16, 32, 16, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 8, 8, 16, 32, 16, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 32, 32, 8, 8, 1, 0, 2, 4, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 16, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp000066400000000000000000000532031463263031500253110ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm_Direct3232' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmDirectComplexSingle = { "XgemmDirect", Precision::kComplexSingle, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 580 2048SP "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 2, 32, 32, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 8, 16, 8, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 8, 8, 8, 16, 1, 1, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 8, 8, 16, 1, 1, 4, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, 
Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 2, 8, 16, 8, 16, 1, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 16, 8, 16, 1, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 8, 8, 8, 8, 16, 1, 0, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 8, 8, 16, 1, 0, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 2, 8, 16, 16, 8, 0, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 16, 16, 0, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 16, 8, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 16, 32, 32, 8, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 8, 8, 8, 8, 8, 0, 0, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 16, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 2, 32, 32, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 8, 16, 8, 16, 0, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) 
Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 2, 8, 8, 8, 16, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 2, 16, 16, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 16, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 2, 32, 32, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 580 "}, Params{ 2, 16, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 16, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 8, 16, 8, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 8, 16, 16, 16, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 16, 8, 8, 16, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 8, 8, 16, 8, 8, 1, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 16, 8, 8, 16, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 8, 8, 16, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 8, 16, 16, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 2, 16, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 
0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 8, 16, 8, 8, 1, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 16, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 2, 32, 32, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 16, 8, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 16, 8, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 2, 8, 16, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 16, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 16, 16, 16, 16, 8, 1, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 16, 8, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 2, 8, 8, 
8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 8, 16, 8, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 16, 8, 8, 8, 0, 0, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 8, 8, 0, 0, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp000066400000000000000000000443651463263031500251620ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm_Direct64' kernels. 
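//
// Illustrative sketch (not emitted by the generator script): the names listed in the
// DatabaseEntry below ("KWID", ..., "WGD") map positionally onto the leading values of each
// Params array; the trailing zeros are padding. Assuming the usual CLBlast meaning of these
// names (WGD: square work-group tile size; MDIMCD x NDIMCD: local work-group shape), a
// plausible launch configuration for an m x n problem could be derived as follows. The
// helper name and its placement here are hypothetical.

#include <cstddef>

namespace clblast_examples {

struct XgemmDirectLaunch { size_t local_x; size_t local_y; size_t global_x; size_t global_y; };

// Derives local and global work sizes from one entry's WGD, MDIMCD and NDIMCD values:
// each work-group computes a WGD x WGD tile of the output, so the grid holds
// ceil(m / WGD) x ceil(n / WGD) work-groups of MDIMCD x NDIMCD threads.
inline XgemmDirectLaunch SketchXgemmDirectLaunch(const size_t m, const size_t n,
                                                 const size_t wgd, const size_t mdimcd,
                                                 const size_t ndimcd) {
  const auto tiles_m = (m + wgd - 1) / wgd;
  const auto tiles_n = (n + wgd - 1) / wgd;
  return XgemmDirectLaunch{mdimcd, ndimcd, tiles_m * mdimcd, tiles_n * ndimcd};
}

} // namespace clblast_examples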
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmDirectDouble = { "XgemmDirect", Precision::kDouble, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 16, 16, 8, 16, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 16, 32, 32, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 16, 8, 8, 8, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 
2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 32, 32, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 8, 8, 8, 32, 8, 0, 1, 2, 2, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 8, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 8, 8, 8, 8, 0, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 16, 32, 16, 16, 0, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 2, 32, 16, 32, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 2, 32, 16, 32, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 580 "}, Params{ 8, 16, 16, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 16, 8, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GeForce GTX 760 Ti OEM "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce 
GTX TITAN Black "}, Params{ 8, 16, 16, 16, 8, 1, 0, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 2, 8, 8, 32, 32, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 2, 16, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 32, 8, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 2, 32, 32, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp000066400000000000000000000445641463263031500253350ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemm_Direct6464' kernels. 
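//
// Illustrative sketch (not emitted by the generator script): every Params array stores
// exactly 16 values, zero-padded beyond the entry's named parameters. A reader can zip the
// DatabaseEntry's name list with the leading values to recover a name-to-value map; the
// helper below is hypothetical and only demonstrates this positional layout.

#include <array>
#include <cstddef>
#include <map>
#include <string>
#include <vector>

namespace clblast_examples {

// Pairs each parameter name with the value at the same index of a zero-padded array.
inline std::map<std::string, size_t> ZipEntryParams(const std::vector<std::string>& names,
                                                    const std::array<size_t, 16>& values) {
  auto result = std::map<std::string, size_t>{};
  for (auto i = size_t{0}; i < names.size(); ++i) { result[names[i]] = values[i]; }
  return result;
}

} // namespace clblast_examples

// For instance, zipping {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB",
// "VWMD", "VWND", "WGD"} with { 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } yields
// KWID=2 ... WGD=16, and the six trailing zeros are ignored as padding.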
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemmDirectComplexDouble = { "XgemmDirect", Precision::kComplexDouble, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 16, 32, 32, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 2, 8, 16, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 8, 16, 8, 8, 0, 0, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 
1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 32, 32, 8, 8, 1, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 32, 8, 8, 1, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 2, 8, 8, 16, 16, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 16, 16, 16, 16, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 16, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 16, 16, 8, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 8, 16, 8, 8, 8, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 32, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 2, 8, 32, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 16, 8, 16, 16, 0, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 580 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GeForce GTX 760 Ti OEM "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 2, 16, 
16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 2, 32, 32, 8, 8, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 2, 32, 32, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 8, 16, 32, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 16, 16, 32, 8, 8, 1, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 2, 32, 32, 16, 16, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { 
Name{"A100-PCIE-40GB "}, Params{ 16, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 2, 16, 8, 8, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 2, 8, 8, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv/000077500000000000000000000000001463263031500201345ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/xgemv/xgemv.cpp000066400000000000000000000013701463263031500217670ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv' kernels. 
// // ================================================================================================= #include "database/kernels/xgemv/xgemv.hpp" #include "database/kernels/xgemv/xgemv_16.hpp" #include "database/kernels/xgemv/xgemv_32.hpp" #include "database/kernels/xgemv/xgemv_3232.hpp" #include "database/kernels/xgemv/xgemv_64.hpp" #include "database/kernels/xgemv/xgemv_6464.hpp" CLBlast-1.6.3/src/database/kernels/xgemv/xgemv.hpp000066400000000000000000000014671463263031500220030ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XgemvHalf; extern const DatabaseEntry XgemvSingle; extern const DatabaseEntry XgemvComplexSingle; extern const DatabaseEntry XgemvDouble; extern const DatabaseEntry XgemvComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv/xgemv_16.hpp000066400000000000000000000224251463263031500223060ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv16' kernels. 
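//
// Illustrative sketch (not emitted by the generator script): XgemvHalf below is the
// Precision::kHalf instance of the five per-precision 'Xgemv' entries declared in
// xgemv.hpp. A dispatcher over those extern entries might look as follows; the function
// name is hypothetical, and it assumes the declarations from xgemv.hpp are in scope (they
// are when this file is included through xgemv.cpp).

namespace clblast {
namespace database {

// Selects the Xgemv database entry that matches the requested precision,
// falling back to single precision for unknown values.
inline const DatabaseEntry& SelectXgemvEntry(const Precision precision) {
  switch (precision) {
    case Precision::kHalf: return XgemvHalf;
    case Precision::kDouble: return XgemvDouble;
    case Precision::kComplexSingle: return XgemvComplexSingle;
    case Precision::kComplexDouble: return XgemvComplexDouble;
    case Precision::kSingle:
    default: return XgemvSingle;
  }
}

} // namespace database
} // namespace clblast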
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvHalf = { "Xgemv", Precision::kHalf, {"WGS1", "WPT1"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 128, 1, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv/xgemv_32.hpp000066400000000000000000000636151463263031500223120ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv32' kernels. 
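//
// Illustrative sketch (not emitted by the generator script): following the usual CLBlast
// naming convention, WGS1 here is the work-group size of the GEMV kernel and WPT1 the
// number of output elements computed per thread. Under that assumption, a plausible global
// work-size computation for an m-row matrix-vector product is sketched below; the helper
// name is hypothetical.

#include <cstddef>

namespace clblast_examples {

// Divides the m rows over threads computing WPT1 rows each, then pads the thread count
// up to a whole number of work-groups of size WGS1.
inline size_t SketchXgemvGlobalSize(const size_t m, const size_t wgs1, const size_t wpt1) {
  const auto threads = (m + wpt1 - 1) / wpt1;   // ceil(m / WPT1)
  return ((threads + wgs1 - 1) / wgs1) * wgs1;  // rounded up to a multiple of WGS1
}

} // namespace clblast_examples

// Example: m = 1000 with the common Params values WGS1 = 64 and WPT1 = 1 gives
// ceil(1000 / 1) = 1000 threads, padded to 1024, i.e. 16 work-groups of 64.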
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvSingle = { "Xgemv", Precision::kSingle, {"WGS1", "WPT1"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { 
Name{"AMD Radeon RX 6600 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GT 650M "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, 
Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, 
Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv/xgemv_3232.hpp000066400000000000000000000604641463263031500224560ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv3232' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvComplexSingle = { "Xgemv", Precision::kComplexSingle, {"WGS1", "WPT1"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 
Compute Engine "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, 
{ Name{"Apple M2 Max "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv/xgemv_64.hpp000066400000000000000000000517561463263031500223220ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvDouble = { "Xgemv", Precision::kDouble, {"WGS1", "WPT1"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 128, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, 
Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv/xgemv_6464.hpp000066400000000000000000000462321463263031500224650ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv6464' kernels. 
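//
// Note on reading these tables: each Params row pairs positionally with the entry's
// parameter-name list ({"WGS1", "WPT1"} for the direct Xgemv kernel); the remaining slots of
// the fixed-width array are zero padding. A minimal sketch of walking one entry, kept in
// comment form so this generated file stays untouched, and assuming the DatabaseVendor /
// DatabaseArchitecture / DatabaseDevice member names from database_structure.hpp ('use' is a
// placeholder for application code):
//
//   for (const auto &vendor : XgemvComplexDouble.vendors)       // e.g. the "NVIDIA" GPUs
//     for (const auto &arch : vendor.architectures)             // e.g. "SM8.6"
//       for (const auto &device : arch.devices)                 // e.g. "NVIDIA RTX A6000"
//         use(device.name, device.parameters[0] /* WGS1 */, device.parameters[1] /* WPT1 */);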
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvComplexDouble = { "Xgemv", Precision::kComplexDouble, {"WGS1", "WPT1"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { 
"gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, 
"Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast/000077500000000000000000000000001463263031500211515ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/xgemv_fast/xgemv_fast.cpp000066400000000000000000000014711463263031500240230ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast' kernels. 
// // ================================================================================================= #include "database/kernels/xgemv_fast/xgemv_fast.hpp" #include "database/kernels/xgemv_fast/xgemv_fast_16.hpp" #include "database/kernels/xgemv_fast/xgemv_fast_32.hpp" #include "database/kernels/xgemv_fast/xgemv_fast_3232.hpp" #include "database/kernels/xgemv_fast/xgemv_fast_64.hpp" #include "database/kernels/xgemv_fast/xgemv_fast_6464.hpp" CLBlast-1.6.3/src/database/kernels/xgemv_fast/xgemv_fast.hpp000066400000000000000000000015201463263031500240230ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XgemvFastHalf; extern const DatabaseEntry XgemvFastSingle; extern const DatabaseEntry XgemvFastComplexSingle; extern const DatabaseEntry XgemvFastDouble; extern const DatabaseEntry XgemvFastComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast/xgemv_fast_16.hpp000066400000000000000000000224211463263031500243340ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast16' kernels. 
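//
// Note on the parameter names below: by the tuner's usual naming conventions, VW2 is the
// vector width of the matrix loads, WGS2 the work-group size, and WPT2 the work per thread of
// the fast Xgemv kernel. A row such as Params{ 2, 32, 2, ... } therefore describes work-groups
// of 32 threads, each thread computing 2 rows via 2-wide vector loads, so one work-group
// covers 32 * 2 = 64 rows of the matrix-vector product.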
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastHalf = { "XgemvFast", Precision::kHalf, {"VW2", "WGS2", "WPT2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 32, 4, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp000066400000000000000000000635551463263031500243470ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast32' kernels. 
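//
// Note: as the header above states, this file is auto-generated and should be regenerated
// rather than edited by hand. The assumed workflow: run the corresponding tuner shipped with
// the library (a binary in the style of ./clblast_tuner_xgemv, built with -DTUNERS=ON), then
// pass the resulting JSON through scripts/database/database.py so that new device rows are
// merged into these tables.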
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastSingle = { "XgemvFast", Precision::kSingle, {"VW2", "WGS2", "WPT2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { 
{ Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 1, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 4, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GT 650M "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, 
Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 32, 4, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp000066400000000000000000000557561463263031500245200ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast3232' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastComplexSingle = { "XgemvFast", Precision::kComplexSingle, {"VW2", "WGS2", "WPT2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD 
Radeon Pro 450 Compute Engine "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 1, 64, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp000066400000000000000000000517601463263031500243470ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastDouble = { "XgemvFast", Precision::kDouble, {"VW2", "WGS2", "WPT2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { 
Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 4, 256, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs 
kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K20m "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla K40m "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 980 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN X "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp000066400000000000000000000455611463263031500245230ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastComplexDouble = { "XgemvFast", Precision::kComplexDouble, {"VW2", "WGS2", "WPT2"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 2, 64, 4, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) Many Integrated Core Acceleration Card "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 1, 16, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , 
Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast_rot/000077500000000000000000000000001463263031500220355ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot.cpp000066400000000000000000000015551463263031500255760ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot' kernels. // // ================================================================================================= #include "database/kernels/xgemv_fast_rot/xgemv_fast_rot.hpp" #include "database/kernels/xgemv_fast_rot/xgemv_fast_rot_16.hpp" #include "database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp" #include "database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp" #include "database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp" #include "database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp" CLBlast-1.6.3/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot.hpp000066400000000000000000000015431463263031500256000ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot' kernels. // // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XgemvFastRotHalf; extern const DatabaseEntry XgemvFastRotSingle; extern const DatabaseEntry XgemvFastRotComplexSingle; extern const DatabaseEntry XgemvFastRotDouble; extern const DatabaseEntry XgemvFastRotComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_16.hpp000066400000000000000000000223051463263031500261050ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot16' kernels. 
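//
// Note, as a reading aid for the tables below (assuming the fixed-width, 16-slot Params layout
// declared in database_structure.hpp): the parameter names listed in this entry, {"VW3", "WGS3",
// "WPT3"}, map positionally onto the first three values of each Params array. For example,
// Params{ 8, 32, 32, 0, ... } selects VW3=8, WGS3=32 and WPT3=32; the trailing thirteen zeros are
// unused padding.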
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastRotHalf = { "XgemvFastRot", Precision::kHalf, {"VW3", "WGS3", "WPT3"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 4, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics 
"}, Params{ 8, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 2, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 2, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp000066400000000000000000000561751463263031500261170ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot32' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastRotSingle = { "XgemvFastRot", Precision::kSingle, {"VW3", "WGS3", "WPT3"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 8, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 8, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, 
Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 2, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 4, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, 
Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 580 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GeForce GT 650M "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 4, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp000066400000000000000000000514021463263031500262500ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot3232' kernels. 
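//
// A minimal resolution sketch of the fallback intended by the kDeviceNameDefault rows below,
// where device names are stored space-padded to a fixed width. The helper name FindParams and the
// members arch.devices, device.name and device.parameters are hypothetical, for illustration only,
// and are not the library's actual API:
//
//   const Params* FindParams(const DatabaseArchitecture& arch, const std::string& padded_name) {
//     for (const auto& device : arch.devices)          // 1) exact, space-padded name match
//       if (device.name == padded_name) { return &device.parameters; }
//     for (const auto& device : arch.devices)          // 2) per-architecture default row
//       if (device.name == kDeviceNameDefault) { return &device.parameters; }
//     return nullptr;                                  // 3) caller falls back to the vendor default
//   }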
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastRotComplexSingle = { "XgemvFastRot", Precision::kComplexSingle, {"VW3", "WGS3", "WPT3"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 4, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, 
Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 4, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", 
{ { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 580 "}, Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GeForce GTX 760 Ti OEM "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 1, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { 
Name{"GeForce GTX 1650 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 2, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 1, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, 
"QUALCOMM", { { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp000066400000000000000000000443551463263031500261210ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastRotDouble = { "XgemvFastRot", Precision::kDouble, {"VW3", "WGS3", "WPT3"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 1, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 4, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 1, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 580 "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GeForce GTX 760 Ti OEM "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 8, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 4, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp000066400000000000000000000420271463263031500262650ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the 
CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgemvFastRotComplexDouble = { "XgemvFastRot", Precision::kComplexDouble, {"VW3", "WGS3", "WPT3"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon R9 Fury X "}, Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 4, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 1, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 1, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
} }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 1, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 8, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 8, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) UHD Graphics 620 "}, Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 580 "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GeForce GTX 760 Ti OEM "}, Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
Name{"NVIDIA GeForce GT 730 "}, Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"Quadro M2000M "}, Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 1, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 1, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 1, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 1, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 1, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA 
GeForce RTX 3080 Laptop GPU "}, Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 2, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 1, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 4, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 1, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 2, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 2, 16, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xger/000077500000000000000000000000001463263031500177535ustar00rootroot00000000000000CLBlast-1.6.3/src/database/kernels/xger/xger.cpp000066400000000000000000000013531463263031500214260ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xger' kernels. // // ================================================================================================= #include "database/kernels/xger/xger.hpp" #include "database/kernels/xger/xger_16.hpp" #include "database/kernels/xger/xger_32.hpp" #include "database/kernels/xger/xger_3232.hpp" #include "database/kernels/xger/xger_64.hpp" #include "database/kernels/xger/xger_6464.hpp" CLBlast-1.6.3/src/database/kernels/xger/xger.hpp000066400000000000000000000014611463263031500214330ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xger' kernels. 
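//
// Each extern DatabaseEntry declared below couples the 'Xger' kernel with one
// precision and stores per-vendor, per-device tables of tuned values. A
// minimal reading sketch, kept in comment form so this generated header stays
// untouched (the exact lookup logic is an assumption here; the real search
// code lives elsewhere under 'src/database/'):
//
//   const DatabaseEntry &entry = XgerSingle;   // Precision::kSingle
//   // The entry names three parameters, {"WGS1", "WGS2", "WPT"}. Every
//   // Params array is zero-padded to a fixed width, so only the first three
//   // values of each device entry are meaningful for this kernel family.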
// // ================================================================================================= #include "database/database_structure.hpp" namespace clblast { namespace database { extern const DatabaseEntry XgerHalf; extern const DatabaseEntry XgerSingle; extern const DatabaseEntry XgerComplexSingle; extern const DatabaseEntry XgerDouble; extern const DatabaseEntry XgerComplexDouble; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xger/xger_16.hpp000066400000000000000000000241421463263031500217420ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xger16' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgerHalf = { "Xger", Precision::kHalf, {"WGS1", "WGS2", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 64, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 16, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 64, 2, 2, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-G57 MC2 r0p1 "}, Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 4, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 128, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 64, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 256, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 64, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xger/xger_32.hpp000066400000000000000000000634031463263031500217430ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xger32' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgerSingle = { "Xger", Precision::kSingle, {"WGS1", "WGS2", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { 
"Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 128, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 64, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T628 "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Mali-T760 "}, Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 4, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 128, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 256, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 256, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 512, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 16, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 16, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 64, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 512, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 128, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GT 650M "}, Params{ 32, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 512, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 512, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 64, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 32, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 64, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 256, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 4, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 16, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 16, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 730", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 3.0 Adreno(TM) 740", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xger/xger_3232.hpp000066400000000000000000000614031463263031500221060ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xger3232' kernels. 
// // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgerComplexSingle = { "Xger", Precision::kComplexSingle, {"WGS1", "WGS2", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 128, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Turks", { { Name{"AMD Radeon HD 6770M "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vancouver", { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { 
kDeviceNameDefault , Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 4, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 64, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 32, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 128, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Apple GPUs kDeviceTypeGPU, "Apple", { { "default", { { Name{"Apple M1 "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Apple M2 Max "}, Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Imagination Technologies GPUs kDeviceTypeGPU, "Imagination Technologies", { { "default", { { Name{"PowerVR B-Series BXE-4-32 "}, Params{ 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 128, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) 
i5-4570 CPU @ 3.20GHz "}, Params{ 512, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 512, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 256, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 512, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 256, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 128, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) Arc(TM) A750 Graphics "}, Params{ 32, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Arc(TM) A770 Graphics "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Iris(R) Xe Graphics "}, Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) RaptorLake-S Mobile Graphics Controller "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 770 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { { "default", { { Name{"Intel(R) FPGA Emulation Device "}, Params{ 128, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 16, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce 
GTX 680 "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 32, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 128, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 16, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 128, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 256, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 128, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 
Ti "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 16, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // QUALCOMM GPUs kDeviceTypeGPU, "QUALCOMM", { { "default", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 128, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "OpenCL C 2.0 Adreno(TM) 640", { { Name{"QUALCOMM Adreno(TM) "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xger/xger_64.hpp000066400000000000000000000514131463263031500217460ustar00rootroot00000000000000 // 
================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xger64' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgerDouble = { "Xger", Precision::kDouble, {"WGS1", "WGS2", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 64, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 2, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 128, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 8, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 256, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 512, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 512, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 
}, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 512, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 512, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 128, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 32, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 128, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 256, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 32, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 32, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 32, 2, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Laptop GPU "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 4, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 16, 8, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/database/kernels/xger/xger_6464.hpp000066400000000000000000000513511463263031500221210ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It // is auto-generated by the 'scripts/database/database.py' Python script. // // This file populates the database with best-found tuning parameters for the 'Xger6464' kernels. // // ================================================================================================= namespace clblast { namespace database { const DatabaseEntry XgerComplexDouble = { "Xger", Precision::kComplexDouble, {"WGS1", "WGS2", "WPT"}, { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "Ellesmere", { { Name{"AMD Radeon RX 480 "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 580 2048SP "}, Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX590 GME "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Fiji", { { Name{"AMD Radeon 500 Series "}, Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 Fury X "}, Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Hawaii", { { Name{"AMD FirePro W8100 "}, Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon R9 290X "}, Params{ 128, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Oland", { { Name{"Oland "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Pitcairn", { { Name{"AMD Radeon R9 270X "}, Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tahiti", { { Name{"AMD Radeon HD 7970 "}, Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Tonga", { { Name{"AMD Radeon R9 380 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "Vega", { { Name{"Radeon RX Vega "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0 } }, } }, { "default", { { Name{"AMD Radeon Pro 450 Compute Engine "}, Params{ 256, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1010:xnack-", { { Name{"AMD Radeon RX 5700 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 5700 XT "}, Params{ 16, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1030", { { Name{"AMD Radeon RX 6800 XT "}, Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon RX 6900 XT "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1031", { { Name{"AMD Radeon RX 6700 XT "}, Params{ 8, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1032", { { Name{"AMD Radeon RX 6600 XT "}, Params{ 8, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1034", { { Name{"AMD Radeon RX 6500 XT "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1035", { { Name{"AMD Radeon Graphics "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1100", { { Name{"Radeon RX 7900 XTX "}, Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1101", { { Name{"AMD Radeon RX 7800 XT "}, Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1102", { { Name{"AMD Radeon RX 7600 "}, Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx1103", { { Name{"AMD Radeon 780M Graphics "}, Params{ 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx902", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"AMD Radeon(TM) RX Vega 10 Graphics "}, Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx906:sramecc+:xnack-", { { Name{"AMD Radeon VII "}, Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "gfx90c", { { Name{"AMD Radeon(TM) Graphics "}, Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // ARM GPUs kDeviceTypeGPU, "ARM", { { "default", { { Name{"Mali-T760 "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "default", { { Name{"Intel(R) Core(TM) i7-2670QM 
CPU @ 2.20GHz "}, Params{ 128, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"12th Gen Intel(R) Core(TM) i7-12700H "}, Params{ 8, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-4590S CPU @ 3.00GHz "}, Params{ 128, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 512, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 256, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 512, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 256, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz "}, Params{ 256, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz "}, Params{ 256, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz "}, Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) UHD Graphics 620 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "SM2.0", { { Name{"GeForce GTX 480 "}, Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 580 "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.0", { { Name{"GRID K520 "}, Params{ 16, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 670 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 128, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { { Name{"GeForce 920A "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX TITAN Black "}, Params{ 16, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GT 730 "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.0", { { Name{"GeForce 920MX "}, Params{ 256, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 "}, Params{ 8, 32, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 750 Ti "}, Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro M2000M "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM5.2", { { Name{"GeForce GTX 970 "}, Params{ 256, 
2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.0", { { Name{"Tesla P100-PCIE-16GB "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM6.1", { { Name{"GeForce GTX 1070 "}, Params{ 8, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1070 Ti "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 "}, Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 1080 Ti "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1060 6GB "}, Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX150 "}, Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN X (Pascal) "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla P4 "}, Params{ 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.0", { { Name{"Quadro GV100 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla V100-PCIE-16GB "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM7.5", { { Name{"GeForce GTX 1650 "}, Params{ 16, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 SUPER "}, Params{ 64, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce GTX 1650 Ti "}, Params{ 32, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce MX450 "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2060 "}, Params{ 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 SUPER "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 Super "}, Params{ 64, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2070 with Max-Q Design "}, Params{ 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 Ti "}, Params{ 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 2080 with Max-Q Design "}, Params{ 4, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Quadro T2000 "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"TITAN RTX "}, Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Tesla T4 "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.0", { { Name{"A100-PCIE-40GB "}, Params{ 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.6", { { Name{"NVIDIA GeForce RTX 3050 Ti Laptop GPU "}, Params{ 8, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3060 Laptop GPU "}, Params{ 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3070 Ti Laptop GPU "}, Params{ 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA 
GeForce RTX 3080 Laptop GPU "}, Params{ 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3080 Ti "}, Params{ 8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 3090 "}, Params{ 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA RTX A6000 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM8.9", { { Name{"NVIDIA GeForce RTX 4050 Laptop GPU "}, Params{ 4, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 "}, Params{ 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4060 Ti "}, Params{ 4, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Laptop GPU "}, Params{ 32, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4070 Ti "}, Params{ 4, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4080 "}, Params{ 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"NVIDIA GeForce RTX 4090 "}, Params{ 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "default", { { kDeviceNameDefault , Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault , Params{ 4, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, } }; } // namespace database } // namespace clblast CLBlast-1.6.3/src/kernel_preprocessor.cpp000066400000000000000000000615751463263031500204070ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the OpenCL kernel preprocessor (see the header for more information). // // Restrictions: // - Use comments only single-line "//" style, not "/*" and "*/" // - Don't use strings with characters parsed (e.g. '//', '}', '#ifdef') // - Supports conditionals: #if #ifdef #ifndef #else #elif #endif // - ...with the operators: == // - "#pragma unroll" requires next loop in the form "for (int w = 0; w < 4; w += 1) {" // The above also requires the spaces in that exact form // - The loop variable should be a unique string within the code in the for-loop body (e.g. don't // use 'i' or 'w' but rather '_w' or a longer name. // - The pragma "#pragma promote_to_registers" unrolls an array into multiple scalar values. The // name of this scalar should be unique (see above). 
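//
// Illustrative example of the two pragmas combined (with hypothetical names 'xval' and 'x'). The
// input
//     #pragma promote_to_registers
//     real xval[2];
//     #pragma unroll
//     for (int _w = 0; _w < 2; _w += 1) {
//       xval[_w] = x[_w];
//     }
// is rewritten by the passes below into
//     real xval_0;
//     real xval_1;
//     {
//       xval_0 = x[0];
//     }
//     {
//       xval_1 = x[1];
//     }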
//
// =================================================================================================

#include <algorithm>
#include <cstdio>
#include <map>
#include <sstream>
#include <unordered_map>
#include <vector>

#include "kernel_preprocessor.hpp"

namespace clblast {
// =================================================================================================

struct compare_longer_string {
  bool operator() (const std::string &lhs, const std::string &rhs) const {
    if (lhs.size() > rhs.size()) { return true; }
    if (lhs.size() < rhs.size()) { return false; }
    return lhs < rhs;
  }
};

using DefinesIntMap = std::map<std::string, size_t, compare_longer_string>;
using DefinesStringMap = std::map<std::string, std::string, compare_longer_string>;

void RaiseError(const std::string& source_line, const std::string& exception_message) {
  printf("[OpenCL pre-processor] Error in source line: %s\n", source_line.c_str());
  throw Error(exception_message);
}

// =================================================================================================

bool HasOnlyDigits(const std::string& str) {
  if (str == "") { return false; }
  return str.find_first_not_of(" 0123456789") == std::string::npos;
}

// Simple unsigned integer math parser
int ParseMath(const std::string& str) {

  // Handles brackets
  if (str.find(")") != std::string::npos) {
    const auto split_close = split(str, ')');
    const auto split_end = split(split_close[0], '(');
    if (split_end.size() < 2) { RaiseError(str, "Mismatching brackets #0"); }
    const auto bracket_contents = ParseMath(split_end[split_end.size() - 1]);
    auto before = std::string{};
    for (auto i = size_t{0}; i < split_end.size() - 1; ++i) {
      before += split_end[i];
      if (i != split_end.size() - 2) { before += "("; }
    }
    auto after = std::string{};
    for (auto i = size_t{1}; i < split_close.size(); ++i) {
      after += split_close[i];
      if (i != split_close.size() - 1) { after += ")"; }
    }
    return ParseMath(before + ToString(bracket_contents) + after);
  }

  // Handles addition
  const auto split_add = split(str, '+');
  if (split_add.size() == 2) {
    const auto lhs = ParseMath(split_add[0]);
    const auto rhs = ParseMath(split_add[1]);
    if (lhs == -1 || rhs == -1) { return -1; }
    return lhs + rhs;
  }

  // Handles multiplication
  const auto split_mul = split(str, '*');
  if (split_mul.size() == 2) {
    const auto lhs = ParseMath(split_mul[0]);
    const auto rhs = ParseMath(split_mul[1]);
    if (lhs == -1 || rhs == -1) { return -1; }
    return lhs * rhs;
  }

  // Handles division
  const auto split_div = split(str, '/');
  if (split_div.size() == 2) {
    const auto lhs = ParseMath(split_div[0]);
    const auto rhs = ParseMath(split_div[1]);
    if (lhs == -1 || rhs == -1) { return -1; }
    return lhs / rhs;
  }

  // Handles the digits
  if (HasOnlyDigits(str)) { return std::stoi(str); }
  return -1; // error value
}
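// Illustrative example of the recursion above (hypothetical input): after define substitution, a
// tuner expression such as "(VW*2)+1" with VW equal to 4 evaluates as
//   ParseMath("(4*2)+1") -> ParseMath("4*2") = 8 -> ParseMath("8+1") = 9
// Any sub-expression that is not plain unsigned-integer math yields -1, the parser's error value.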
// Converts a string to an integer. The source line is printed in case an exception is raised.
size_t StringToDigit(const std::string& str, const std::string& source_line) {
  const auto result = ParseMath(str);
  if (result == -1) { RaiseError(source_line, "Not a digit: " + str); }
  return static_cast<size_t>(result);
}

// =================================================================================================

void FindReplace(std::string &subject, const std::string &search, const std::string &replace) {
  auto pos = size_t{0};
  while ((pos = subject.find(search, pos)) != std::string::npos) {
    subject.replace(pos, search.length(), replace);
    pos += replace.length();
  }
}

void SubstituteDefines(const DefinesIntMap& defines, std::string& source_string) {
  for (const auto &define : defines) {
    FindReplace(source_string, define.first, std::to_string(define.second));
  }
}

bool EvaluateCondition(std::string condition,
                       const DefinesIntMap &defines,
                       const DefinesStringMap &defines_string) {

  // Replace macros in the string
  SubstituteDefines(defines, condition);

  // Process the or sign
  const auto or_pos = condition.find(" || ");
  if (or_pos != std::string::npos) {
    const auto left = condition.substr(0, or_pos);
    const auto right = condition.substr(or_pos + 4);
    return EvaluateCondition(left, defines, defines_string) ||
           EvaluateCondition(right, defines, defines_string);
  }

  // Process the and sign
  const auto and_pos = condition.find(" && ");
  if (and_pos != std::string::npos) {
    const auto left = condition.substr(0, and_pos);
    const auto right = condition.substr(and_pos + 4);
    return EvaluateCondition(left, defines, defines_string) &&
           EvaluateCondition(right, defines, defines_string);
  }

  // Process the !defined() construct
  const auto not_defined_pos = condition.find("!defined(");
  if (not_defined_pos != std::string::npos) {
    const auto contents = condition.substr(not_defined_pos + 9);
    const auto not_defined_split = split(contents, ')');
    const auto not_defined_val = not_defined_split[0];
    return (defines_string.find(not_defined_val) == defines_string.end());
  }

  // Process the defined() construct
  const auto defined_pos = condition.find("defined(");
  if (defined_pos != std::string::npos) {
    const auto contents = condition.substr(defined_pos + 8);
    const auto defined_split = split(contents, ')');
    const auto defined_val = defined_split[0];
    return (defines_string.find(defined_val) != defines_string.end());
  }

  // Process the equality sign
  const auto equal_pos = condition.find(" == ");
  if (equal_pos != std::string::npos) {
    const auto left = condition.substr(0, equal_pos);
    const auto right = condition.substr(equal_pos + 4);
    return (left == right);
  }

  // Process the not equal sign
  const auto not_equal_pos = condition.find(" != ");
  if (not_equal_pos != std::string::npos) {
    const auto left = condition.substr(0, not_equal_pos);
    const auto right = condition.substr(not_equal_pos + 4);
    return (left != right);
  }

  // Process the smaller than sign
  const auto smaller_than_pos = condition.find(" < ");
  if (smaller_than_pos != std::string::npos) {
    const auto left = condition.substr(0, smaller_than_pos);
    const auto right = condition.substr(smaller_than_pos + 3);
    return (left < right);
  }

  // Process the larger than sign
  const auto larger_than_pos = condition.find(" > ");
  if (larger_than_pos != std::string::npos) {
    const auto left = condition.substr(0, larger_than_pos);
    const auto right = condition.substr(larger_than_pos + 3);
    return (left > right);
  }

  printf("Warning unknown condition: %s\n", condition.c_str());
  return false; // unknown error
}

// =================================================================================================
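// Illustrative example (hypothetical kernel line): with VW defined as 2, the directive
// "#if VW == 2 || VW == 4" becomes the condition "2 == 2 || 2 == 4", which evaluates to true
// above. Substitution happens longest-name-first thanks to compare_longer_string, so a define
// named VW cannot clobber one named VWM. Note that " < " and " > " compare the substituted
// strings lexicographically, which only matches numerical comparison when both sides have the
// same number of digits.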
// Array to register promotion, e.g. arr[w] to {arr_0, arr_1}
void ArrayToRegister(std::string &source_line, const DefinesIntMap& defines,
                     const std::unordered_map<std::string, size_t>& arrays_to_registers,
                     const size_t num_brackets) {

  for (const auto& array_name_map : arrays_to_registers) { // only if marked to be promoted

    // Outside of a function
    if (num_brackets == 0) {

      // Case 1: argument in a function declaration (e.g. 'void func(const float arr[2])')
      const auto array_pos = source_line.find(array_name_map.first + "[");
      if (array_pos != std::string::npos) {
        SubstituteDefines(defines, source_line);

        // Finds the full array declaration (e.g. 'const float arr[2]')
        const auto left_split = split(source_line, '(');
        auto arguments = left_split.size() >= 2 ? left_split[1] : source_line;
        const auto right_split = split(arguments, ')');
        arguments = right_split.size() >= 1 ? right_split[0] : arguments;
        const auto comma_split = split(arguments, ',');
        for (auto j = size_t{0}; j < comma_split.size(); ++j) {
          if (comma_split[j].find(array_name_map.first + "[") != std::string::npos) {

            // Retrieves the array index
            const auto left_square_split = split(comma_split[j], '[');
            if (left_square_split.size() < 2) { RaiseError(source_line, "Mis-formatted array declaration #A"); }
            const auto right_square_split = split(left_square_split[1], ']');
            if (right_square_split.size() < 1) { RaiseError(source_line, "Mis-formatted array declaration #B"); }
            auto array_index_string = right_square_split[0];
            const auto array_index = StringToDigit(array_index_string, source_line);

            // Creates the new string
            auto replacement = std::string{};
            for (auto index = size_t{0}; index < array_index; ++index) {
              replacement += left_square_split[0] + "_" + ToString(index);
              if (index != array_index - 1) { replacement += ","; }
            }

            // Performs the actual replacement
            FindReplace(source_line, comma_split[j], replacement);
          }
        }
      }
    }

    // Inside a function
    else {
      auto array_pos = source_line.find(array_name_map.first + "[");

      // Case 2: passed to another function (e.g. 'func(arr)')
      if (array_pos == std::string::npos) { // assumes case 2 and case 3 (below) cannot occur in one line
        auto bracket_split = split(source_line, '(');
        if (bracket_split.size() >= 2) {
          auto replacement = std::string{};
          for (auto i = size_t{0}; i < array_name_map.second; ++i) {
            replacement += array_name_map.first + "_" + ToString(i);
            if (i != array_name_map.second - 1) { replacement += ", "; }
          }
          FindReplace(source_line, array_name_map.first, replacement);
        }
      }
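      // (Illustrative, hypothetical names: with 'xval' promoted to two registers, a call such as
      // 'MultiplyAdd(acc, xval, bval)' on this line becomes 'MultiplyAdd(acc, xval_0, xval_1, bval)'.)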
      // Case 3: used as an array (e.g. 'arr[w]')
      while (array_pos != std::string::npos) {

        // Retrieves the array index
        const auto loop_remainder = source_line.substr(array_pos);
        const auto loop_split = split(split(loop_remainder, '[')[1], ']');
        if (loop_split.size() < 2) { RaiseError(source_line, "Mis-formatted array declaration #C"); }
        auto array_index_string = loop_split[0];

        // Replaces the array with a register value
        SubstituteDefines(defines, array_index_string);
        const auto array_index = StringToDigit(array_index_string, source_line);
        FindReplace(source_line, array_name_map.first + "[" + loop_split[0] + "]",
                    array_name_map.first + "_" + ToString(array_index));

        // Performs an extra substitution if this array occurs another time in this line
        array_pos = source_line.find(array_name_map.first + "[");
      }
    }
  }
}

// =================================================================================================

// First pass: detect defines and comments
std::vector<std::string> PreprocessDefinesAndComments(const std::string& source,
                                                      DefinesIntMap& defines_int) {
  auto lines = std::vector<std::string>();
  auto defines_string = DefinesStringMap();

  // Parse the input string into a vector of lines
  const auto max_depth_defines = 30;
  auto disabled = std::vector<int>(max_depth_defines, 0);
  auto depth = size_t{0};
  std::stringstream source_stream(source);
  auto line = std::string{""};
  while (std::getline(source_stream, line)) {
    //printf("[@%zu] disabled=%d '%s'\n", depth, disabled[depth], line.c_str());

    // Decide whether or not to remain in 'disabled' mode
    // {0 => enabled, 1 => disabled, but could become enabled again later, 2 => disabled until #endif}
    if (line.find("#endif") != std::string::npos) {
      disabled[depth] = 0;
    }
    if (line.find("#elif") != std::string::npos || line.find("#else") != std::string::npos) {
      if (disabled[depth] == 0) { disabled[depth] = 2; } // was enabled, now disabled until #endif
      if (disabled[depth] == 1) { disabled[depth] = 0; } // was disabled, now potentially enabled again
    }

    // Measures the depth of pre-processor defines
    if ((line.find("#ifndef ") != std::string::npos) ||
        (line.find("#ifdef ") != std::string::npos) ||
        (line.find("#if ") != std::string::npos)) {
      depth++;
      if (depth >= max_depth_defines) { throw Error("too deep define nest"); }
    }
    if (line.find("#endif") != std::string::npos) {
      if (depth == 0) { throw Error("incorrect define nest"); }
      depth--;
    }

    // Verifies whether this level or any level below is disabled
    auto is_disabled = false;
    for (auto d = size_t{0}; d <= depth; ++d) {
      if (disabled[d] >= 1) { is_disabled = true; }
    }

    // Not in a disabled-block
    if (!is_disabled) {

      // Skip empty lines
      if (line == "") { continue; }

      // Single line comments
      const auto comment_pos = line.find("//");
      if (comment_pos != std::string::npos) {
        if (comment_pos == 0) { continue; }
        line.erase(comment_pos);
      }

      // Detect #define macros
      const auto define_pos = line.find("#define ");
      if (define_pos != std::string::npos) {
        const auto define = line.substr(define_pos + 8); // length of "#define "
        const auto value_pos = define.find(" ");
        auto value = define.substr(value_pos + 1);
        const auto name = define.substr(0, value_pos);
        SubstituteDefines(defines_int, value);
        const auto value_int = ParseMath(value);
        if (value_int != -1) {
          defines_int.emplace(name, value_int);
        }
        defines_string.emplace(name, value);
      }
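      // (Illustrative: '#define PRECISION 32' is stored in both maps, whereas '#define ZERO 0.0f'
      // is stored in 'defines_string' only, since '0.0f' is not unsigned-integer math.)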
      // Detect #undef macros
      // When USE_SUBGROUP_SHUFFLING is set, but kernel parameters do not satisfy the conditions
      // for subgroup shuffle, USE_SUBGROUP_SHUFFLING needs to be unset in preprocessing
      // to avoid GEMM kernel errors. See src/kernels/level3/xgemm_part1.opencl line 142.
      // In this preprocessor, macros are not redefined because of behavior defined by std::map::emplace
      const auto undef_pos = line.find("#undef ");
      if (undef_pos != std::string::npos) {
        const auto undef = line.substr(undef_pos + 7); // length of "#undef "
        // checks if definition is found in defines_int and/or defines_string, then removes the definition
        auto int_undef = defines_int.find(undef);
        if (int_undef != defines_int.end()) {
          defines_int.erase(int_undef);
        }
        auto string_undef = defines_string.find(undef);
        if (string_undef != defines_string.end()) {
          defines_string.erase(string_undef);
        }
      }

      // Detect #ifndef blocks
      const auto ifndef_pos = line.find("#ifndef ");
      if (ifndef_pos != std::string::npos) {
        const auto define = line.substr(ifndef_pos + 8); // length of "#ifndef "
        if (defines_string.find(define) != defines_string.end()) { disabled[depth] = 1; }
        continue;
      }

      // Detect #ifdef blocks
      const auto ifdef_pos = line.find("#ifdef ");
      if (ifdef_pos != std::string::npos) {
        const auto define = line.substr(ifdef_pos + 7); // length of "#ifdef "
        if (defines_string.find(define) == defines_string.end()) { disabled[depth] = 1; }
        continue;
      }

      // Detect #if blocks
      const auto if_pos = line.find("#if ");
      if (if_pos != std::string::npos) {
        const auto condition = line.substr(if_pos + 4); // length of "#if "
        if (!EvaluateCondition(condition, defines_int, defines_string)) { disabled[depth] = 1; }
        continue;
      }

      // Detect #elif blocks
      const auto elif_pos = line.find("#elif ");
      if (elif_pos != std::string::npos) {
        const auto condition = line.substr(elif_pos + 6); // length of "#elif "
        if (!EvaluateCondition(condition, defines_int, defines_string)) { disabled[depth] = 1; }
        continue;
      }

      // Discard #else and #endif statements
      if (line.find("#endif") != std::string::npos || line.find("#else") != std::string::npos) {
        continue;
      }

      lines.push_back(line);
    }
  }
  return lines;
}

// =================================================================================================

// Second pass: detect array-to-register promotion pragma's and replace declarations & function calls
std::vector<std::string> PreprocessUnrollLoops(const std::vector<std::string>& source_lines,
                                               const DefinesIntMap& defines,
                                               std::unordered_map<std::string, size_t>& arrays_to_registers) {
  auto lines = std::vector<std::string>();

  auto brackets = size_t{0};
  auto promote_next_array_to_registers = false;

  for (auto line_id = size_t{0}; line_id < source_lines.size(); ++line_id) {
    auto line = source_lines[line_id];

    // Detect #pragma promote_to_registers directives (unofficial pragma)
    if (line.find("#pragma promote_to_registers") != std::string::npos) {
      promote_next_array_to_registers = true;
      continue;
    }

    // Brackets
    brackets += std::count(line.begin(), line.end(), '{');
    brackets -= std::count(line.begin(), line.end(), '}');

    // Promote array declarations to registers
    if (promote_next_array_to_registers) {
      promote_next_array_to_registers = false;
      const auto line_split1 = split(line, '[');
      if (line_split1.size() != 2) { RaiseError(line, "Mis-formatted array declaration #0"); }
      const auto line_split2 = split(line_split1[1], ']');
      if (line_split2.size() != 2) { RaiseError(line, "Mis-formatted array declaration #1"); }
      auto array_size_string = line_split2[0];
      SubstituteDefines(defines, array_size_string);
      const auto array_size = StringToDigit(array_size_string, line);
      for (auto loop_iter = size_t{0}; loop_iter < array_size; ++loop_iter) {
        lines.emplace_back(line_split1[0] + "_" + ToString(loop_iter) + line_split2[1]);
      }
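      // (Illustrative: under '#pragma promote_to_registers', a declaration 'real xval[2];' is
      // emitted here as the two scalar lines 'real xval_0;' and 'real xval_1;'.)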
      // Stores the array name
      const auto array_name_split = split(line_split1[0], ' ');
      if (array_name_split.size() < 2) { RaiseError(line, "Mis-formatted array declaration #2"); }
      const auto array_name = array_name_split[array_name_split.size() - 1];
      arrays_to_registers[array_name] = array_size;
      // TODO: bracket count not used currently for scope checking
      continue;
    }

    // Regular line
    lines.emplace_back(line);
  }
  return lines;
}

// =================================================================================================

// Third pass: unroll loops and perform actual array-to-register promotion
std::vector<std::string> PreprocessUnrollLoops(const std::vector<std::string>& source_lines,
                                               const DefinesIntMap& defines,
                                               std::unordered_map<std::string, size_t>& arrays_to_registers,
                                               const bool array_to_register_promotion) {
  auto lines = std::vector<std::string>();

  auto brackets = size_t{0};
  auto unroll_next_loop = false;

  for (auto line_id = size_t{0}; line_id < source_lines.size(); ++line_id) {
    auto line = source_lines[line_id];

    // Detect #pragma unroll directives
    if (line.find("#pragma unroll") != std::string::npos) {
      unroll_next_loop = true;
      continue;
    }

    // Brackets
    const auto num_brackets_before = brackets;
    brackets += std::count(line.begin(), line.end(), '{');
    brackets -= std::count(line.begin(), line.end(), '}');

    // Loop unrolling assuming it to be in the form "for (int w = 0; w < 4; w += 1) {"
    if (unroll_next_loop) {
      unroll_next_loop = false;

      // Parses loop structure
      const auto for_pos = line.find("for (");
      if (for_pos == std::string::npos) { RaiseError(line, "Mis-formatted for-loop #0"); }
      const auto remainder = line.substr(for_pos + 5); // length of "for ("
      const auto line_split = split(remainder, ' ');
      if (line_split.size() != 11) { RaiseError(line, "Mis-formatted for-loop #1"); }

      // Retrieves loop information (and checks for assumptions)
      const auto variable_type = line_split[0];
      const auto variable_name = line_split[1];
      if (variable_name != line_split[4]) { RaiseError(line, "Mis-formatted for-loop #2"); }
      if (variable_name != line_split[7]) { RaiseError(line, "Mis-formatted for-loop #3"); }
      auto loop_start_string = line_split[3];
      auto loop_end_string = line_split[6];
      auto loop_increment_string = line_split[9];
      remove_character(loop_start_string, ';');
      remove_character(loop_end_string, ';');
      remove_character(loop_increment_string, ')');

      // Parses loop information
      SubstituteDefines(defines, loop_start_string);
      SubstituteDefines(defines, loop_end_string);
      SubstituteDefines(defines, loop_increment_string);
      const auto loop_start = StringToDigit(loop_start_string, line);
      const auto loop_end = StringToDigit(loop_end_string, line);
      const auto loop_increment = StringToDigit(loop_increment_string, line);
      auto indent = std::string{""};
      for (auto i = size_t{0}; i < for_pos; ++i) { indent += " "; }

      // Start of the loop
      line_id++;
      const auto loop_num_brackets = brackets;
      const auto line_id_start = line_id;
      for (auto loop_iter = loop_start; loop_iter < loop_end; loop_iter += loop_increment) {
        line_id = line_id_start;
        brackets = loop_num_brackets;
        lines.emplace_back(indent + "{");

        // Body of the loop
        //lines.emplace_back(indent + " " + variable_type + " " + variable_name + " = " + ToString(loop_iter) + ";");
        while (brackets >= loop_num_brackets) {
          auto loop_line = source_lines[line_id];
          brackets += std::count(loop_line.begin(), loop_line.end(), '{');
          brackets -= std::count(loop_line.begin(), loop_line.end(), '}');

          // Regular variable substitution
          FindReplace(loop_line, variable_name, ToString(loop_iter));
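          // (Illustrative: for '#pragma unroll' over "for (int _w = 0; _w < 2; _w += 1) {", the
          // body is emitted twice, with '_w' replaced by the literals '0' and '1'. This is plain
          // text substitution, which is why the loop variable must not occur as a substring
          // elsewhere in the body.)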
          // Array to register promotion
          if (array_to_register_promotion) {
            ArrayToRegister(loop_line, defines, arrays_to_registers, num_brackets_before);
          }

          lines.emplace_back(loop_line);
          line_id++;
        }
        line_id--;
      }
    }
    else {

      // Array to register promotion
      if (array_to_register_promotion) {
        ArrayToRegister(line, defines, arrays_to_registers, num_brackets_before);
      }

      lines.emplace_back(line);
    }
  }
  return lines;
}

// =================================================================================================

std::string PreprocessKernelSource(const std::string& kernel_source) {

  // Retrieves the defines and removes comments from the source lines
  auto defines = DefinesIntMap();
  auto lines = PreprocessDefinesAndComments(kernel_source, defines);

  // Unrolls loops (single level each call)
  auto arrays_to_registers = std::unordered_map<std::string, size_t>();
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers);
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
  lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, true);

  // Gather the results
  auto processed_kernel = std::string{""};
  for (const auto& line : lines) { processed_kernel += line + "\n"; }

  // Debugging
  if (false) {
    for (auto i = size_t{0}; i < lines.size(); ++i) { printf("[%zu] %s\n", i, lines[i].c_str()); }
  }
  return processed_kernel;
}

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/kernel_preprocessor.hpp000066400000000000000000000024121463263031500203750ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file contains a simple pre-processor for the OpenCL kernels. This pre-processor is used
// in cases where the vendor's OpenCL compiler falls short in loop unrolling and array-to-register
// promotion. This pre-processor is specific for the CLBlast code making many assumptions.
//
// =================================================================================================

#ifndef CLBLAST_KERNEL_PREPROCESSOR_H_
#define CLBLAST_KERNEL_PREPROCESSOR_H_

#include <string>

#include "utilities/utilities.hpp"

namespace clblast {
// =================================================================================================

std::string PreprocessKernelSource(const std::string& kernel_source);

// =================================================================================================
} // namespace clblast

// CLBLAST_KERNEL_PREPROCESSOR_H_
#endif
CLBlast-1.6.3/src/kernels/000077500000000000000000000000001463263031500152425ustar00rootroot00000000000000CLBlast-1.6.3/src/kernels/common.opencl000066400000000000000000000224121463263031500177350ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
// // Author(s): // Cedric Nugteren // // This file contains the common defines and type-defs for the CLBlast OpenCL kernels. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this file is used outside of the CLBlast library. #ifndef PRECISION #define PRECISION 32 // Data-types: half, single or double precision, complex or regular #endif // ================================================================================================= #ifndef CUDA // Enable support for half-precision #if PRECISION == 16 #pragma OPENCL EXTENSION cl_khr_fp16: enable #endif // Enable support for double-precision #if PRECISION == 64 || PRECISION == 6464 #pragma OPENCL EXTENSION cl_khr_fp64: enable #endif #endif // Half-precision #if PRECISION == 16 typedef half real; typedef half2 real2; typedef half4 real4; typedef half8 real8; typedef half16 real16; #define ZERO 0 #define ONE 1 #define SMALLEST -1.0e14 // Single-precision #elif PRECISION == 32 typedef float real; typedef float2 real2; typedef float4 real4; typedef float8 real8; typedef float16 real16; #define ZERO 0.0f #define ONE 1.0f #define SMALLEST -1.0e37f // Double-precision #elif PRECISION == 64 typedef double real; typedef double2 real2; typedef double4 real4; typedef double8 real8; typedef double16 real16; #define ZERO 0.0 #define ONE 1.0 #define SMALLEST -1.0e37 // Complex single-precision #elif PRECISION == 3232 typedef float2 real; typedef struct cfloat2 {real x; real y;} real2; typedef struct cfloat4 {real x; real y; real z; real w;} real4; typedef struct cfloat8 {real s0; real s1; real s2; real s3; real s4; real s5; real s6; real s7;} real8; typedef struct cfloat16 {real s0; real s1; real s2; real s3; real s4; real s5; real s6; real s7; real s8; real s9; real sA; real sB; real sC; real sD; real sE; real sF;} real16; #define ZERO 0.0f #define ONE 1.0f #define SMALLEST -1.0e37f // Complex double-precision #elif PRECISION == 6464 typedef double2 real; typedef struct cdouble2 {real x; real y;} real2; typedef struct cdouble4 {real x; real y; real z; real w;} real4; typedef struct cdouble8 {real s0; real s1; real s2; real s3; real s4; real s5; real s6; real s7;} real8; typedef struct cdouble16 {real s0; real s1; real s2; real s3; real s4; real s5; real s6; real s7; real s8; real s9; real sA; real sB; real sC; real sD; real sE; real sF;} real16; #define ZERO 0.0 #define ONE 1.0 #define SMALLEST -1.0e37 #endif // Single-element version of a complex number #if PRECISION == 3232 typedef float singlereal; #elif PRECISION == 6464 typedef double singlereal; #else typedef real singlereal; #endif // Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no // conversion, but half-precision is not supported as kernel argument so it is converted from float. 
#if PRECISION == 16 typedef float real_arg; #define GetRealArg(x) (half)x #else typedef real real_arg; #define GetRealArg(x) x #endif // Pointers to local memory objects (using a define because CUDA doesn't need them) #ifndef LOCAL_PTR #define LOCAL_PTR __local #endif // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific // devices, this is enabled (see src/routine.cpp). #ifndef USE_CL_MAD #define USE_CL_MAD 0 #endif // By default the workgroup size requirement is enabled. For Qualcomm devices the workgroup size // requirement results in worse performance and is disabled (src/utilities/compile.cpp) #ifndef RELAX_WORKGROUP_SIZE #define RELAX_WORKGROUP_SIZE 0 #endif // Sets a variable to zero #if PRECISION == 3232 || PRECISION == 6464 #define SetToZero(a) a.x = ZERO; a.y = ZERO #else #define SetToZero(a) a = ZERO #endif // Sets a variable to zero (only the imaginary part) #if PRECISION == 3232 || PRECISION == 6464 #define ImagToZero(a) a.y = ZERO #else #define ImagToZero(a) #endif // Sets a variable to one #if PRECISION == 3232 || PRECISION == 6464 #define SetToOne(a) a.x = ONE; a.y = ZERO #else #define SetToOne(a) a = ONE #endif // Determines whether a variable is zero #if PRECISION == 3232 || PRECISION == 6464 #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO)) #else #define IsZero(a) (a == ZERO) #endif // The absolute value (component-wise) #if PRECISION == 3232 || PRECISION == 6464 #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y) #else #define AbsoluteValue(value) value = fabs(value) #endif // Negation (component-wise) #if PRECISION == 3232 || PRECISION == 6464 #define Negate(value) value.x = -(value.x); value.y = -(value.y) #else #define Negate(value) value = -(value) #endif // Adds two complex variables #if PRECISION == 3232 || PRECISION == 6464 #define Add(c,a,b) c.x = a.x + b.x; c.y = a.y + b.y #else #define Add(c,a,b) c = a + b #endif // Subtracts two complex variables #if PRECISION == 3232 || PRECISION == 6464 #define Subtract(c,a,b) c.x = a.x - b.x; c.y = a.y - b.y #else #define Subtract(c,a,b) c = a - b #endif // Multiply two complex variables (used in the defines below) #if PRECISION == 3232 || PRECISION == 6464 #define MulReal(a,b) a.x*b.x - a.y*b.y #define MulImag(a,b) a.x*b.y + a.y*b.x #endif // The scalar multiply function #if PRECISION == 3232 || PRECISION == 6464 #define Multiply(c,a,b) c.x = MulReal(a,b); c.y = MulImag(a,b) #else #define Multiply(c,a,b) c = a * b #endif // The scalar multiply-add function #if PRECISION == 3232 || PRECISION == 6464 #define MultiplyAdd(c,a,b) c.x += MulReal(a,b); c.y += MulImag(a,b) #else #if USE_CL_MAD == 1 #define MultiplyAdd(c,a,b) c = mad(a, b, c) #else #define MultiplyAdd(c,a,b) c += a * b #endif #endif // The scalar multiply-subtract function #if PRECISION == 3232 || PRECISION == 6464 #define MultiplySubtract(c,a,b) c.x -= MulReal(a,b); c.y -= MulImag(a,b) #else #define MultiplySubtract(c,a,b) c -= a * b #endif // The scalar division function: full division #if PRECISION == 3232 || PRECISION == 6464 #define DivideFull(c,a,b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom #else #define DivideFull(c,a,b) c = a / b #endif // The scalar AXPBY function #if PRECISION == 3232 || PRECISION == 6464 #define AXPBY(e,a,b,c,d) e.x = MulReal(a,b) + 
MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d) #else #define AXPBY(e,a,b,c,d) e = a*b + c*d #endif // The complex conjugate operation for complex transforms #if PRECISION == 3232 || PRECISION == 6464 #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y #else #define COMPLEX_CONJUGATE(value) #endif // ================================================================================================= // Force inlining functions or not: some compilers don't support the inline keyword #ifdef USE_INLINE_KEYWORD #define INLINE_FUNC inline #else #define INLINE_FUNC #endif // ================================================================================================= // Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is // enabled (see src/routine.cc). #ifndef USE_STAGGERED_INDICES #define USE_STAGGERED_INDICES 0 #endif // Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from: // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf // More details: https://github.com/CNugteren/CLBlast/issues/53 #if USE_STAGGERED_INDICES == 1 && GEMMK == 0 INLINE_FUNC int GetGroupIDFlat() { return get_group_id(0) + get_num_groups(0) * get_group_id(1); } INLINE_FUNC int GetGroupID1() { return (GetGroupIDFlat()) % get_num_groups(1); } INLINE_FUNC int GetGroupID0() { return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); } #else INLINE_FUNC int GetGroupID1() { return get_group_id(1); } INLINE_FUNC int GetGroupID0() { return get_group_id(0); } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level1/000077500000000000000000000000001463263031500164325ustar00rootroot00000000000000CLBlast-1.6.3/src/kernels/level1/level1.opencl000066400000000000000000000112601463263031500210240ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the common functions and parameters specific for level 1 BLAS kernels. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. 
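// Worked example (illustrative only, not part of the original library code; names prefixed with
// 'Example' are hypothetical): the three parameters defined below compose multiplicatively. For
// instance, with WGS=64, WPT=2 and VW=4, each thread handles WPT vectors of VW elements, so one
// work-group covers 64*2*4 = 512 elements per pass. The real launch logic lives in the C++
// routines; this helper only sketches the relationship.
INLINE_FUNC int ExampleElementsPerWorkgroup(const int wgs, const int wpt, const int vw) {
  return wgs * wpt * vw;  // threads per group * work-per-thread * elements per vector
}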
#ifndef WGS #define WGS 64 // The local work-group size #endif #ifndef WPT #define WPT 1 // The amount of work-per-thread #endif #ifndef VW #define VW 1 // Vector width of vectors X and Y #endif // ================================================================================================= // Data-widths #if VW == 1 typedef real realV; #elif VW == 2 typedef real2 realV; #elif VW == 4 typedef real4 realV; #elif VW == 8 typedef real8 realV; #elif VW == 16 typedef real16 realV; #endif // ================================================================================================= // The vectorized multiply function INLINE_FUNC realV MultiplyVector(realV cvec, const real aval, const realV bvec) { #if VW == 1 Multiply(cvec, aval, bvec); #elif VW == 2 Multiply(cvec.x, aval, bvec.x); Multiply(cvec.y, aval, bvec.y); #elif VW == 4 Multiply(cvec.x, aval, bvec.x); Multiply(cvec.y, aval, bvec.y); Multiply(cvec.z, aval, bvec.z); Multiply(cvec.w, aval, bvec.w); #elif VW == 8 Multiply(cvec.s0, aval, bvec.s0); Multiply(cvec.s1, aval, bvec.s1); Multiply(cvec.s2, aval, bvec.s2); Multiply(cvec.s3, aval, bvec.s3); Multiply(cvec.s4, aval, bvec.s4); Multiply(cvec.s5, aval, bvec.s5); Multiply(cvec.s6, aval, bvec.s6); Multiply(cvec.s7, aval, bvec.s7); #elif VW == 16 Multiply(cvec.s0, aval, bvec.s0); Multiply(cvec.s1, aval, bvec.s1); Multiply(cvec.s2, aval, bvec.s2); Multiply(cvec.s3, aval, bvec.s3); Multiply(cvec.s4, aval, bvec.s4); Multiply(cvec.s5, aval, bvec.s5); Multiply(cvec.s6, aval, bvec.s6); Multiply(cvec.s7, aval, bvec.s7); Multiply(cvec.s8, aval, bvec.s8); Multiply(cvec.s9, aval, bvec.s9); Multiply(cvec.sA, aval, bvec.sA); Multiply(cvec.sB, aval, bvec.sB); Multiply(cvec.sC, aval, bvec.sC); Multiply(cvec.sD, aval, bvec.sD); Multiply(cvec.sE, aval, bvec.sE); Multiply(cvec.sF, aval, bvec.sF); #endif return cvec; } // The vectorized multiply-add function INLINE_FUNC realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { #if VW == 1 MultiplyAdd(cvec, aval, bvec); #elif VW == 2 MultiplyAdd(cvec.x, aval, bvec.x); MultiplyAdd(cvec.y, aval, bvec.y); #elif VW == 4 MultiplyAdd(cvec.x, aval, bvec.x); MultiplyAdd(cvec.y, aval, bvec.y); MultiplyAdd(cvec.z, aval, bvec.z); MultiplyAdd(cvec.w, aval, bvec.w); #elif VW == 8 MultiplyAdd(cvec.s0, aval, bvec.s0); MultiplyAdd(cvec.s1, aval, bvec.s1); MultiplyAdd(cvec.s2, aval, bvec.s2); MultiplyAdd(cvec.s3, aval, bvec.s3); MultiplyAdd(cvec.s4, aval, bvec.s4); MultiplyAdd(cvec.s5, aval, bvec.s5); MultiplyAdd(cvec.s6, aval, bvec.s6); MultiplyAdd(cvec.s7, aval, bvec.s7); #elif VW == 16 MultiplyAdd(cvec.s0, aval, bvec.s0); MultiplyAdd(cvec.s1, aval, bvec.s1); MultiplyAdd(cvec.s2, aval, bvec.s2); MultiplyAdd(cvec.s3, aval, bvec.s3); MultiplyAdd(cvec.s4, aval, bvec.s4); MultiplyAdd(cvec.s5, aval, bvec.s5); MultiplyAdd(cvec.s6, aval, bvec.s6); MultiplyAdd(cvec.s7, aval, bvec.s7); MultiplyAdd(cvec.s8, aval, bvec.s8); MultiplyAdd(cvec.s9, aval, bvec.s9); MultiplyAdd(cvec.sA, aval, bvec.sA); MultiplyAdd(cvec.sB, aval, bvec.sB); MultiplyAdd(cvec.sC, aval, bvec.sC); MultiplyAdd(cvec.sD, aval, bvec.sD); MultiplyAdd(cvec.sE, aval, bvec.sE); MultiplyAdd(cvec.sF, aval, bvec.sF); #endif return cvec; } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level1/xamax.opencl000066400000000000000000000115371463263031500207610ustar00rootroot00000000000000 
// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xamax kernel. It implements index of (absolute) min/max computation using // reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is // loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel // is executed with a single workgroup only, computing the final result. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. #ifndef WGS1 #define WGS1 64 // The local work-group size of the main kernel #endif #ifndef WGS2 #define WGS2 64 // The local work-group size of the epilogue kernel #endif // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xamax(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global singlereal* maxgm, __global unsigned int* imaxgm) { __local singlereal maxlm[WGS1]; __local unsigned int imaxlm[WGS1]; const int lid = get_local_id(0); const int wgid = get_group_id(0); const int num_groups = get_num_groups(0); // Performs loading and the first steps of the reduction #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) || defined(ROUTINE_AMIN) singlereal max = SMALLEST; #else singlereal max = ZERO; #endif unsigned int imax = 0; int id = wgid*WGS1 + lid; while (id < n) { const int x_index = id*x_inc + x_offset; #if PRECISION == 3232 || PRECISION == 6464 singlereal x = fabs(xgm[x_index].x) + fabs(xgm[x_index].y); #else singlereal x = xgm[x_index]; #endif #if defined(ROUTINE_MAX) // non-absolute maximum version // nothing special here #elif defined(ROUTINE_MIN) // non-absolute minimum version x = -x; #elif defined(ROUTINE_AMIN) // absolute minimum version x = -fabs(x); #else x = fabs(x); #endif if (x > max) { max = x; imax = id; } id += WGS1*num_groups; } maxlm[lid] = max; imaxlm[lid] = imax; barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { if (maxlm[lid + s] > maxlm[lid]) { maxlm[lid] = maxlm[lid + s]; imaxlm[lid] = imaxlm[lid + s]; } } barrier(CLK_LOCAL_MEM_FENCE); } // Stores the per-workgroup result if (lid == 0) { maxgm[wgid] = maxlm[0]; imaxgm[wgid] = imaxlm[0]; } } // ================================================================================================= // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. 
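// Note (illustrative, based on the code in this file rather than on the host routines): the
// epilogue below reads pairs at 'lid' and 'lid + WGS2', so it assumes the main kernel was
// launched with exactly 2*WGS2 work-groups, producing 2*WGS2 partial results; treat that launch
// configuration as an assumption made here. The halving tree reduction used by both kernels can
// be sketched as the following hypothetical helper ('ExampleLocalArgMax' is not library code):
INLINE_FUNC void ExampleLocalArgMax(LOCAL_PTR singlereal* maxlm, LOCAL_PTR unsigned int* imaxlm,
                                    const int lid, const int size) {
  for (int s = size/2; s > 0; s = s >> 1) {  // halves the active range every iteration
    if (lid < s) {
      if (maxlm[lid + s] > maxlm[lid]) {  // keeps the larger value together with its index
        maxlm[lid] = maxlm[lid + s];
        imaxlm[lid] = imaxlm[lid + s];
      }
    }
    barrier(CLK_LOCAL_MEM_FENCE);  // every thread must reach the barrier, hence no early exit
  }
}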
#if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void XamaxEpilogue(const __global singlereal* restrict maxgm, const __global unsigned int* restrict imaxgm, __global unsigned int* imax, const int imax_offset) { __local singlereal maxlm[WGS2]; __local unsigned int imaxlm[WGS2]; const int lid = get_local_id(0); // Performs the first step of the reduction while loading the data if (maxgm[lid + WGS2] > maxgm[lid]) { maxlm[lid] = maxgm[lid + WGS2]; imaxlm[lid] = imaxgm[lid + WGS2]; } else { maxlm[lid] = maxgm[lid]; imaxlm[lid] = imaxgm[lid]; } barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { if (maxlm[lid + s] > maxlm[lid]) { maxlm[lid] = maxlm[lid + s]; imaxlm[lid] = imaxlm[lid + s]; } } barrier(CLK_LOCAL_MEM_FENCE); } // Stores the final result if (lid == 0) { imax[imax_offset] = imaxlm[0]; } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level1/xasum.opencl000066400000000000000000000077041463263031500210010ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xasum kernel. It implements an absolute-sum computation using reduction // kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded, // followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel // is executed with a single workgroup only, computing the final result. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library.
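// Note on the complex precisions (an illustration, not library code): AbsoluteValue() from
// common.opencl takes the component-wise fabs, so the accumulator effectively holds the pair
// (sum of |Re|, sum of |Im|). The epilogue then adds the two components, which matches the BLAS
// definition of *asum for complex vectors: sum_i (|Re(x_i)| + |Im(x_i)|). A hypothetical scalar
// sketch of the per-element contribution:
INLINE_FUNC singlereal ExampleComplexAsumTerm(const singlereal re, const singlereal im) {
  return fabs(re) + fabs(im);  // the per-element term of SCASUM/DZASUM
}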
#ifndef WGS1 #define WGS1 64 // The local work-group size of the main kernel #endif #ifndef WGS2 #define WGS2 64 // The local work-group size of the epilogue kernel #endif // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xasum(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* output) { __local real lm[WGS1]; const int lid = get_local_id(0); const int wgid = get_group_id(0); const int num_groups = get_num_groups(0); // Performs loading and the first steps of the reduction real acc; SetToZero(acc); int id = wgid*WGS1 + lid; while (id < n) { real x = xgm[id*x_inc + x_offset]; #if defined(ROUTINE_SUM) // non-absolute version #else AbsoluteValue(x); #endif Add(acc, acc, x); id += WGS1*num_groups; } lm[lid] = acc; barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); } barrier(CLK_LOCAL_MEM_FENCE); } // Stores the per-workgroup result if (lid == 0) { output[wgid] = lm[0]; } } // ================================================================================================= // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void XasumEpilogue(const __global real* restrict input, __global real* asum, const int asum_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); // Performs the first step of the reduction while loading the data Add(lm[lid], input[lid], input[lid + WGS2]); barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); } barrier(CLK_LOCAL_MEM_FENCE); } // Computes the absolute value and stores the final result if (lid == 0) { #if (PRECISION == 3232 || PRECISION == 6464) && defined(ROUTINE_ASUM) asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number #else asum[asum_offset] = lm[0]; #endif } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level1/xaxpy.opencl000066400000000000000000000106351463263031500210120ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't // support vector data-types. The general version has a batched implementation as well. // // This kernel uses the level-1 BLAS common tuning parameters. 
// // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xaxpy(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { const real alpha = GetRealArg(arg_alpha); // Loops over the work that needs to be done (allows for an arbitrary number of threads) for (int id = get_global_id(0); id < n; id += get_global_size(0)) { real xvalue = xgm[id*x_inc + x_offset]; MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue); } } // Faster version of the kernel without offsets and strided accesses but with an if-statement. Also // assumes that 'n' is divisible by 'VW' and 'WPT'. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XaxpyFaster(const int n, const real_arg arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { const real alpha = GetRealArg(arg_alpha); const int num_useful_threads = n / (VW * WPT); if (get_global_id(0) < num_useful_threads) { #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id = _w*num_useful_threads + get_global_id(0); realV xvalue = xgm[id]; realV yvalue = ygm[id]; ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue); } } } // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // divisible by 'VW', 'WGS' and 'WPT'.
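// Illustration (not used by the library; helper names are hypothetical): the fast kernels in
// this file index the vectors as id = w*total_threads + global_id rather than as
// id = global_id*WPT + w. With the former, neighbouring threads touch neighbouring vector
// elements in every unrolled iteration, which keeps the loads coalesced:
INLINE_FUNC int ExampleCoalescedIndex(const int w, const int total_threads, const int gid) {
  return w*total_threads + gid;  // neighbouring threads read neighbouring elements
}
INLINE_FUNC int ExampleStridedIndex(const int w, const int wpt, const int gid) {
  return gid*wpt + w;  // neighbouring threads would be WPT elements apart (uncoalesced)
}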
#if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XaxpyFastest(const int n, const real_arg arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id = _w*get_global_size(0) + get_global_id(0); realV xvalue = xgm[id]; realV yvalue = ygm[id]; ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue); } } // ================================================================================================= // Full version of the kernel with offsets and strided accesses: batched version #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XaxpyBatched(const int n, const __constant real_arg* arg_alphas, const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc, __global real* ygm, const __constant int* y_offsets, const int y_inc) { const int batch = get_group_id(1); const real alpha = GetRealArg(arg_alphas[batch]); // Loops over the work that needs to be done (allows for an arbitrary number of threads) for (int id = get_global_id(0); id < n; id += get_global_size(0)) { real xvalue = xgm[id*x_inc + x_offsets[batch]]; MultiplyAdd(ygm[id*y_inc + y_offsets[batch]], alpha, xvalue); } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level1/xcopy.opencl000066400000000000000000000047301463263031500210020ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xcopy kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't // support vector data-types. // // This kernel uses the level-1 BLAS common tuning parameters. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xcopy(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { // Loops over the work that needs to be done (allows for an arbitrary number of threads) for (int id = get_global_id(0); id // // This file contains the Xdot kernel. It implements a dot-product computation using reduction // kernels. Reduction is split in two parts. In the first (main) kernel the X and Y vectors are // multiplied, followed by a per-thread and a per-workgroup reduction. 
The second (epilogue) kernel // is executed with a single workgroup only, computing the final result. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. #ifndef WGS1 #define WGS1 64 // The local work-group size of the main kernel #endif #ifndef WGS2 #define WGS2 64 // The local work-group size of the epilogue kernel #endif // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the sum operation #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xdot(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, __global real* output, const int do_conjugate) { __local real lm[WGS1]; const int lid = get_local_id(0); const int wgid = get_group_id(0); const int num_groups = get_num_groups(0); // Performs multiplication and the first steps of the reduction real acc; SetToZero(acc); int id = wgid*WGS1 + lid; while (id < n) { real x = xgm[id*x_inc + x_offset]; real y = ygm[id*y_inc + y_offset]; if (do_conjugate) { COMPLEX_CONJUGATE(x); } MultiplyAdd(acc, x, y); id += WGS1*num_groups; } lm[lid] = acc; barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); } barrier(CLK_LOCAL_MEM_FENCE); } // Stores the per-workgroup result if (lid == 0) { output[wgid] = lm[0]; } } // ================================================================================================= // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to // be launched with a single workgroup only. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void XdotEpilogue(const __global real* restrict input, __global real* dot, const int dot_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); // Performs the first step of the reduction while loading the data Add(lm[lid], input[lid], input[lid + WGS2]); barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); } barrier(CLK_LOCAL_MEM_FENCE); } // Stores the final result if (lid == 0) { dot[dot_offset] = lm[0]; } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level1/xhad.opencl000066400000000000000000000140501463263031500205600ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // This file contains the Xhad kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=incz=1) and no offsets (offx=offy=offz=0). Another version is more general, // but doesn't support vector data-types. Based on the XAXPY kernels. // // This kernel uses the level-1 BLAS common tuning parameters. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // A vector-vector multiply function. See also level1.opencl for a vector-scalar version INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV bvec) { #if VW == 1 Multiply(cvec, aval, bvec); #elif VW == 2 Multiply(cvec.x, aval.x, bvec.x); Multiply(cvec.y, aval.y, bvec.y); #elif VW == 4 Multiply(cvec.x, aval.x, bvec.x); Multiply(cvec.y, aval.y, bvec.y); Multiply(cvec.z, aval.z, bvec.z); Multiply(cvec.w, aval.w, bvec.w); #elif VW == 8 Multiply(cvec.s0, aval.s0, bvec.s0); Multiply(cvec.s1, aval.s1, bvec.s1); Multiply(cvec.s2, aval.s2, bvec.s2); Multiply(cvec.s3, aval.s3, bvec.s3); Multiply(cvec.s4, aval.s4, bvec.s4); Multiply(cvec.s5, aval.s5, bvec.s5); Multiply(cvec.s6, aval.s6, bvec.s6); Multiply(cvec.s7, aval.s7, bvec.s7); #elif VW == 16 Multiply(cvec.s0, aval.s0, bvec.s0); Multiply(cvec.s1, aval.s1, bvec.s1); Multiply(cvec.s2, aval.s2, bvec.s2); Multiply(cvec.s3, aval.s3, bvec.s3); Multiply(cvec.s4, aval.s4, bvec.s4); Multiply(cvec.s5, aval.s5, bvec.s5); Multiply(cvec.s6, aval.s6, bvec.s6); Multiply(cvec.s7, aval.s7, bvec.s7); Multiply(cvec.s8, aval.s8, bvec.s8); Multiply(cvec.s9, aval.s9, bvec.s9); Multiply(cvec.sA, aval.sA, bvec.sA); Multiply(cvec.sB, aval.sB, bvec.sB); Multiply(cvec.sC, aval.sC, bvec.sC); Multiply(cvec.sD, aval.sD, bvec.sD); Multiply(cvec.sE, aval.sE, bvec.sE); Multiply(cvec.sF, aval.sF, bvec.sF); #endif return cvec; } // ================================================================================================= // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, __global real* zgm, const int z_offset, const int z_inc) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Loops over the work that needs to be done (allows for an arbitrary number of threads) for (int id = get_global_id(0); id < n; id += get_global_size(0)) { real xvalue = xgm[id*x_inc + x_offset]; real yvalue = ygm[id*y_inc + y_offset]; real zvalue = zgm[id*z_inc + z_offset]; real result; real alpha_times_x; Multiply(alpha_times_x, alpha, xvalue); Multiply(result, alpha_times_x, yvalue); MultiplyAdd(result, beta, zvalue); zgm[id*z_inc + z_offset] = result; } } // Faster version of the kernel without offsets and strided accesses but with an if-statement. Also // assumes that 'n' is divisible by 'VW' and 'WPT'.
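// Worked example (illustrative only; 'ExampleHadamardElement' is a hypothetical helper, not part
// of the library): per element, the kernels in this file compute
//   z[i] = alpha * x[i] * y[i] + beta * z[i],
// i.e. an element-wise (Hadamard) product fused with an AXPBY-style update. A scalar sketch using
// the macros from common.opencl:
INLINE_FUNC real ExampleHadamardElement(const real alpha, const real x, const real y,
                                        const real beta, const real z) {
  real alpha_times_x;
  Multiply(alpha_times_x, alpha, x);   // alpha * x
  real result;
  Multiply(result, alpha_times_x, y);  // (alpha * x) * y
  MultiplyAdd(result, beta, z);        // + beta * z
  return result;
}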
#if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global realV* restrict xgm, const __global realV* restrict ygm, __global realV* zgm) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); const int num_desired_threads = n / (VW * WPT); if (get_global_id(0) < num_desired_threads) { #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id = _w * num_desired_threads + get_global_id(0); realV xvalue = xgm[id]; realV yvalue = ygm[id]; realV zvalue = zgm[id]; realV result; realV alpha_times_x; alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue); result = MultiplyVectorVector(result, alpha_times_x, yvalue); zgm[id] = MultiplyAddVector(result, beta, zvalue); } } } // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // divisible by 'VW', 'WGS' and 'WPT'. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global realV* restrict xgm, const __global realV* restrict ygm, __global realV* zgm) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id = _w*get_global_size(0) + get_global_id(0); realV xvalue = xgm[id]; realV yvalue = ygm[id]; realV zvalue = zgm[id]; realV result; realV alpha_times_x; alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue); result = MultiplyVectorVector(result, alpha_times_x, yvalue); zgm[id] = MultiplyAddVector(result, beta, zvalue); } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level1/xnrm2.opencl000066400000000000000000000076131463263031500207070ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction // kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared, // followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel // is executed with a single workgroup only, computing the final result. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library.
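// Note (illustrative, not library code; 'ExampleNrm2Accumulate' is a hypothetical helper): the
// main kernel below accumulates x * conj(x) per element. For the complex precisions the
// imaginary parts cancel, so the accumulator stays real-valued and the epilogue can take a plain
// square root of the .x component. One accumulation step sketched with the common.opencl macros:
INLINE_FUNC real ExampleNrm2Accumulate(real acc, const real x) {
  real x_conjugated = x;
  COMPLEX_CONJUGATE(x_conjugated);   // a no-op for the non-complex precisions
  MultiplyAdd(acc, x, x_conjugated); // acc += x * conj(x) == |x|^2
  return acc;
}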
#ifndef WGS1 #define WGS1 64 // The local work-group size of the main kernel #endif #ifndef WGS2 #define WGS2 64 // The local work-group size of the epilogue kernel #endif // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the operation #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xnrm2(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* output) { __local real lm[WGS1]; const int lid = get_local_id(0); const int wgid = get_group_id(0); const int num_groups = get_num_groups(0); // Performs multiplication and the first steps of the reduction real acc; SetToZero(acc); int id = wgid*WGS1 + lid; while (id < n) { real x1 = xgm[id*x_inc + x_offset]; real x2 = x1; COMPLEX_CONJUGATE(x2); MultiplyAdd(acc, x1, x2); id += WGS1*num_groups; } lm[lid] = acc; barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); } barrier(CLK_LOCAL_MEM_FENCE); } // Stores the per-workgroup result if (lid == 0) { output[wgid] = lm[0]; } } // ================================================================================================= // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void Xnrm2Epilogue(const __global real* restrict input, __global real* nrm2, const int nrm2_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); // Performs the first step of the reduction while loading the data Add(lm[lid], input[lid], input[lid + WGS2]); barrier(CLK_LOCAL_MEM_FENCE); // Performs reduction in local memory for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { Add(lm[lid], lm[lid], lm[lid + s]); } barrier(CLK_LOCAL_MEM_FENCE); } // Computes the square root and stores the final result if (lid == 0) { #if PRECISION == 3232 || PRECISION == 6464 nrm2[nrm2_offset].x = sqrt(lm[0].x); // the result is a non-complex number #else nrm2[nrm2_offset] = sqrt(lm[0]); #endif } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level1/xscal.opencl000066400000000000000000000052061463263031500207510ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xscal kernel. It contains one fast vectorized version in case of unit // strides (incx=1) and no offsets (offx=0). Another version is more general, but doesn't support // vector data-types. // // This kernel uses the level-1 BLAS common tuning parameters. 
// // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xscal(const int n, const real_arg arg_alpha, __global real* xgm, const int x_offset, const int x_inc) { const real alpha = GetRealArg(arg_alpha); // Loops over the work that needs to be done (allows for an arbitrary number of threads) for (int id = get_global_id(0); id // // This file contains the Xswap kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't // support vector data-types. // // This kernel uses the level-1 BLAS common tuning parameters. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xswap(const int n, __global real* xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { // Loops over the work that needs to be done (allows for an arbitrary number of threads) for (int id = get_global_id(0); id // // This file contains common functions for matrix update kernels (Xger, Xher). // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. 
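// Illustration (not part of the original file; the helper name is hypothetical): the packed
// routines handled below (SPR/HPR and friends) address the triangle of A linearly. For
// column-major packed storage of the upper triangle, element (row, col) with row <= col lives at
// index ((col+1)*col)/2 + row, since the preceding columns 0..col-1 contribute
// 1 + 2 + ... + col = ((col+1)*col)/2 elements. A sketch of that formula:
INLINE_FUNC int ExamplePackedUpperIndex(const int row, const int col) {
  return ((col + 1) * col) / 2 + row;  // e.g. (row=1, col=2) -> 3*2/2 + 1 = index 4
}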
#ifndef WGS1 #define WGS1 8 // The local work-group size in first dimension #endif #ifndef WGS2 #define WGS2 8 // The local work-group size in second dimension #endif #ifndef WPT #define WPT 1 // The amount of work-per-thread in both dimensions #endif // ================================================================================================= // Returns an element from a vector INLINE_FUNC real LoadVector(const int id, const int max, const __global real* gm, const int offset, const int inc, const int do_conjugate) { if (id < max) { real result = gm[id*inc + offset]; if (do_conjugate) { #if defined(ROUTINE_GERC) || defined(ROUTINE_HER) || defined(ROUTINE_HPR) || defined(ROUTINE_HER2) || defined(ROUTINE_HPR2) COMPLEX_CONJUGATE(result); #endif } return result; } else { real default_result; SetToZero(default_result); return default_result; } } // Performs the rank-1 matrix update INLINE_FUNC void MatrixUpdate(const int id1, const int id2, const int max1, const int max2, __global real* agm, const int a_offset, const int a_ld, const real alpha, const real xvalue, const real yvalue, const int is_upper) { // Bounds of a regular matrix if (id1 < max1 && id2 < max2) { #if defined(ROUTINE_SPR) || defined(ROUTINE_HPR) int a_index; if (is_upper) { a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2; } else { a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2; } a_index += a_offset; #else const int a_index = id2*a_ld + id1 + a_offset; #endif // Loads the current value of the A matrix const real avalue = agm[a_index]; // Computes result = alpha * x[i] * y[j] + a[i][j] #if PRECISION == 3232 || PRECISION == 6464 real ax; ax.x = MulReal(alpha, xvalue); ax.y = MulImag(alpha, xvalue); real result; result.x = MulReal(ax, yvalue) + avalue.x; result.y = MulImag(ax, yvalue) + avalue.y; #else real result = alpha * xvalue * yvalue + avalue; #endif // For hermitian matrices #if defined(ROUTINE_HER) || defined(ROUTINE_HPR) if (id1 == id2) { result.y = ZERO; } #endif // Stores the final result agm[a_index] = result; } } // Performs the rank-2 matrix update INLINE_FUNC void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2, __global real* agm, const int a_offset, const int a_ld, const real alpha1, const real xvalue, const real yvalue, const real alpha2, const real xtvalue, const real ytvalue, const int is_upper) { // Bounds of a regular matrix if (id1 < max1 && id2 < max2) { #if defined(ROUTINE_SPR2) || defined(ROUTINE_HPR2) int a_index; if (is_upper) { a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2; } else { a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2; } a_index += a_offset; #else const int a_index = id2*a_ld + id1 + a_offset; #endif // Loads the current value of the A matrix const real avalue = agm[a_index]; // Computes result = alpha * x[i] * y[j] + alpha * x[j] * y[i] + a[i][j] #if PRECISION == 3232 || PRECISION == 6464 real ax; ax.x = MulReal(alpha2, xvalue); ax.y = MulImag(alpha2, xvalue); real atx; atx.x = MulReal(alpha1, xtvalue); atx.y = MulImag(alpha1, xtvalue); real result; result.x = MulReal(ax, yvalue) + MulReal(atx, ytvalue) + avalue.x; result.y = MulImag(ax, yvalue) + MulImag(atx, ytvalue) + avalue.y; #else real result = alpha1 * xvalue * yvalue + alpha2 * xtvalue * ytvalue + avalue; #endif // For hermitian matrices #if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2) if (id1 == id2) { result.y = ZERO; } #endif // Stores the final result agm[a_index] = result; } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level2/xgemv.opencl000066400000000000000000000242361463263031500207700ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xgemv kernel (generic version) for matrix-vector multiplication. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library.
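// Illustration (an assumption-flagged sketch, not used by the library): for the banded case
// (GBMV) handled in LoadMatrixA below, column y of the band is stored with the diagonal element
// at band row ku, so element (x, y) of the full matrix maps to band row ku - y + x whenever
// y - ku <= x <= y + kl. A hypothetical helper mirroring the 'a_ld*y + k + x' expression (with
// k = ku - y, and ignoring a_offset):
INLINE_FUNC int ExampleBandedIndex(const int x, const int y, const int a_ld, const int ku) {
  return a_ld*y + (ku - y) + x;  // in-band storage index of full-matrix element (x, y)
}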
// 1: For the full version of the kernel #ifndef WGS1 #define WGS1 64 // The local work-group size #endif #ifndef WPT1 #define WPT1 1 // The amount of work-per-thread #endif #ifndef UNROLL1 #define UNROLL1 32 // Unroll factor (must be a divider of WGS1) #endif // 2 and 3: For the fast versions, see 'xgemv_fast.opencl' // ================================================================================================= // Defines how to load the input matrix in the non-vectorized case INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, const int y, const int a_ld, const int a_offset, const int parameter, const int kl, const int ku) { real result; // For banded matrices #if defined(ROUTINE_GBMV) const int k = ku - y; if (x >= y-ku && x < y+kl+1) { result = agm[a_ld*y + k + x + a_offset]; } else { SetToZero(result); } // For symmetric/hermitian matrices #elif defined(ROUTINE_HEMV) || defined(ROUTINE_SYMV) if ((parameter == 0 && y <= x) || (parameter == 1 && x <= y)) { result = agm[a_ld*y + x + a_offset]; #if defined(ROUTINE_HEMV) if (x == y) { result.y = ZERO; } #endif } else { result = agm[a_ld*x + y + a_offset]; #if defined(ROUTINE_HEMV) COMPLEX_CONJUGATE(result); #endif } // For triangular matrices #elif defined(ROUTINE_TRMV) if (((parameter == 0 || parameter == 2) && y <= x) || ((parameter == 1 || parameter == 3) && x <= y)) { result = agm[a_ld*y + x + a_offset]; if (parameter >= 2 && y == x) { SetToOne(result); } } else { SetToZero(result); } // For symmetric/hermitian banded matrices #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV) if (parameter == 1) { if (x <= y) { const int m = kl - y; if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; } else { SetToZero(result); } #if defined(ROUTINE_HBMV) if (x == y) { result.y = ZERO; } #endif } else { const int m = kl - x; if (y >= x-kl && y <= x) { result = agm[a_ld*x + m + y + a_offset]; } else { SetToZero(result); } #if defined(ROUTINE_HBMV) COMPLEX_CONJUGATE(result); #endif } } else { if (x >= y) { const int m = -y; if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; } else { SetToZero(result); } #if defined(ROUTINE_HBMV) if (x == y) { result.y = ZERO; } #endif } else { const int m = -x; if (y >= x && y < x+kl+1) { result = agm[a_ld*x + m + y + a_offset]; } else { SetToZero(result); } #if defined(ROUTINE_HBMV) COMPLEX_CONJUGATE(result); #endif } } // For triangular banded matrices #elif defined(ROUTINE_TBMV) if (parameter == 1 || parameter == 3) { if (x <= y) { const int m = kl - y; if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; } else { SetToZero(result); } if (parameter >= 2 && y == x) { SetToOne(result); } } else { SetToZero(result); } } else { if (x >= y) { const int m = -y; if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; } else { SetToZero(result); } if (parameter >= 2 && y == x) { SetToOne(result); } } else { SetToZero(result); } } // For symmetric/hermitian packed matrices #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV) if (parameter == 1) { if (x <= y) { result = agm[((y+1)*y)/2 + x + a_offset]; #if defined(ROUTINE_HPMV) if (x == y) { result.y = ZERO; } #endif } else { result = agm[((x+1)*x)/2 + y + a_offset]; #if defined(ROUTINE_HPMV) COMPLEX_CONJUGATE(result); #endif } } else { if (x >= y) { result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset]; #if defined(ROUTINE_HPMV) if (x == y) { result.y = ZERO; } #endif } else { result = agm[((2*a_ld-(x+1))*x)/2 + y + a_offset]; #if defined(ROUTINE_HPMV) COMPLEX_CONJUGATE(result); #endif } 
} // For triangular packed matrices #elif defined(ROUTINE_TPMV) if (parameter == 1 || parameter == 3) { if (x <= y) { result = agm[((y+1)*y)/2 + x + a_offset]; if (parameter >= 2 && y == x) { SetToOne(result); } } else { SetToZero(result); } } else { if (x >= y) { result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset]; if (parameter >= 2 && y == x) { SetToOne(result); } } else { SetToZero(result); } } // For general matrices #else result = agm[a_ld*y + x + a_offset]; #endif return result; } // ================================================================================================= // Full version of the kernel #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xgemv(const int m, const int n, const real_arg arg_alpha, const real_arg arg_beta, const int a_rotated, const __global real* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl, const int ku) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Local memory for the vector X __local real xlm[WGS1]; // Initializes the accumulation register #pragma promote_to_registers real acc1[WPT1]; #pragma unroll for (int _w = 0; _w < WPT1; _w += 1) { SetToZero(acc1[_w]); } // Divides the work in a main and tail section const int n_tail = n % WGS1; const int n_floor = n - n_tail; // Loops over work-group sized portions of the work for (int kwg=0; kwg // // This file contains the Xgemv kernel (fast versions) for matrix-vector multiplication. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. 
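// Note (illustrative only; the fixed-width helper below is hypothetical): the fast kernels in
// this file load the matrix through vector pointers, so a single load of width VW2 fetches VW2
// consecutive elements of a column. This is why 'a_ld' must be a multiple of VW2 (and likewise
// VW3 for the rotated kernel): every column then starts at a vector-aligned index. A 4-wide
// version of such a load, with x and the leading dimension counted in real4 units:
INLINE_FUNC real4 ExampleLoadFourElements(const __global real4* restrict agm,
                                          const int x, const int y, const int a_ld_in_vectors) {
  return agm[a_ld_in_vectors*y + x];  // one vector load == four consecutive column elements
}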
// 1: For the full version, see 'xgemv.opencl' // 2: For the fast version #ifndef WGS2 #define WGS2 64 // The local work-group size #endif #ifndef WPT2 #define WPT2 1 // The amount of work-per-thread #endif #ifndef VW2 #define VW2 1 // Vector width of matrix A loads #endif // 3: For the fast rotated version #ifndef WGS3 #define WGS3 64 // The local work-group size #endif #ifndef WPT3 #define WPT3 1 // The tile-size #endif #ifndef VW3 #define VW3 1 // Vector width of matrix A loads #endif // ================================================================================================= // Data-widths for the 'fast' kernel #if VW2 == 1 typedef real realVF; #elif VW2 == 2 typedef real2 realVF; #elif VW2 == 4 typedef real4 realVF; #elif VW2 == 8 typedef real8 realVF; #elif VW2 == 16 typedef real16 realVF; #endif // Data-widths for the 'fast' kernel with rotated matrix #if VW3 == 1 typedef real realVFR; #elif VW3 == 2 typedef real2 realVFR; #elif VW3 == 4 typedef real4 realVFR; #elif VW3 == 8 typedef real8 realVFR; #elif VW3 == 16 typedef real16 realVFR; #endif // ================================================================================================= // Loads a vector input value INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, const int a_ld) { return agm[a_ld*y + x]; } // ================================================================================================= // Faster version of the kernel, assuming that: // --> 'm' and 'n' are multiples of WGS2 // --> 'a_offset' is 0 // --> 'a_ld' is a multiple of VW2 // --> 'a_rotated' is 0 // --> 'do_conjugate' is 0 #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void XgemvFast(const int m, const int n, const real_arg arg_alpha, const real_arg arg_beta, const int a_rotated, const __global realVF* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl_unused, const int ku_unused) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Local memory for the vector X __local real xlm[WGS2]; // Initializes the accumulation registers #pragma promote_to_registers real acc2[WPT2]; #pragma unroll for (int _w = 0; _w < WPT2; _w += 1) { SetToZero(acc2[_w]); } // Loops over work-group sized portions of the work for (int kwg=0; kwg 'm' and 'n' are multiples of WGS3 // --> 'a_offset' is 0 // --> 'a_ld' is a multiple of VW3 // --> 'a_rotated' is 1 // --> 'do_conjugate' is 0 #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS3, 1, 1))) #endif void XgemvFastRot(const int m, const int n, const real_arg arg_alpha, const real_arg arg_beta, const int a_rotated, const __global realVFR* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl_unused, const int ku_unused) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Local memory to store a tile of the matrix (for coalescing) __local real tile[WPT3][WGS3]; const int lid = get_local_id(0); const int lid_mod = lid % (WPT3/VW3); const int lid_div = lid / (WPT3/VW3); // Local memory for the vector X __local real xlm[WPT3]; // Initializes 
the accumulation register real acc3; SetToZero(acc3); // Loops over tile-sized portions of the work for (int kwg=0; kwg // // This file contains the Xger kernels for rank-1 matrix update. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) #endif void Xger(const int max1, const int max2, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_rowmajor) { const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y #pragma promote_to_registers real xvalues[WPT]; #pragma promote_to_registers real yvalues[WPT]; // Row-major version if (is_rowmajor) { // Loads the X-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id2 = _w*get_global_size(1) + get_global_id(1); xvalues[_w] = LoadVector(id2, max2, xgm, x_offset, x_inc, false); } // Loads the Y-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id1 = _w*get_global_size(0) + get_global_id(0); yvalues[_w] = LoadVector(id1, max1, ygm, y_offset, y_inc, true); } // Loops over the work per thread twice #pragma unroll for (int _w1 = 0; _w1 < WPT; _w1 += 1) { #pragma unroll for (int _w2 = 0; _w2 < WPT; _w2 += 1) { // Global thread IDs const int id1 = _w1*get_global_size(0) + get_global_id(0); const int id2 = _w2*get_global_size(1) + get_global_id(1); // Loads A, performs the operation, and stores the result into A MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld, alpha, xvalues[_w2], yvalues[_w1], false); } } } // Col-major version else { // Loads the X-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id1 = _w*get_global_size(0) + get_global_id(0); xvalues[_w] = LoadVector(id1, max1, xgm, x_offset, x_inc, false); } // Loads the Y-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id2 = _w*get_global_size(1) + get_global_id(1); yvalues[_w] = LoadVector(id2, max2, ygm, y_offset, y_inc, true); } // Loops over the work per thread twice #pragma unroll for (int _w1 = 0; _w1 < WPT; _w1 += 1) { #pragma unroll for (int _w2 = 0; _w2 < WPT; _w2 += 1) { // Global thread IDs const int id1 = _w1*get_global_size(0) + get_global_id(0); const int id2 = _w2*get_global_size(1) + get_global_id(1); // Loads A, performs the operation, and stores the result into A MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld, alpha, xvalues[_w1], yvalues[_w2], false); } } } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level2/xher.opencl000066400000000000000000000055701463263031500206120ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xher kernels for rank-1 matrix update. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) #endif void Xher(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { const real alpha = GetRealArg(arg_alpha); // Register storage for X and XT #pragma promote_to_registers real xvalues[WPT]; #pragma promote_to_registers real xtvalues[WPT]; // Loads the X-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id2 = _w*get_global_size(1) + get_global_id(1); xvalues[_w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor); } // Loads the X-transposed-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id1 = _w*get_global_size(0) + get_global_id(0); xtvalues[_w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor); } // Loops over the work per thread twice #pragma unroll for (int _w1 = 0; _w1 < WPT; _w1 += 1) { #pragma unroll for (int _w2 = 0; _w2 < WPT; _w2 += 1) { // Global thread IDs const int id1 = _w1*get_global_size(0) + get_global_id(0); const int id2 = _w2*get_global_size(1) + get_global_id(1); // Skip these threads if they do not contain threads contributing to the matrix-triangle if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) { // Do nothing } // Loads A, performs the operation, and stores the result into A else { MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[_w2], xtvalues[_w1], is_upper); } } } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level2/xher2.opencl000066400000000000000000000075661463263031500207030ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the Xher2 kernels for rank-2 matrix update. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. 
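// For reference, the rank-2 update implemented by the Xher2 kernel below computes
// A := alpha*x*y^H + conj(alpha)*y*x^H + A on a single triangle only. The following is a
// minimal single-threaded host-side sketch of the same operation (illustrative only; the
// 'cfloat', 'cmul' and 'cconj' helpers are hypothetical and not part of CLBlast):

typedef struct { float re; float im; } cfloat;

cfloat cmul(const cfloat a, const cfloat b) {  // complex multiplication
  cfloat r; r.re = a.re*b.re - a.im*b.im; r.im = a.re*b.im + a.im*b.re; return r;
}
cfloat cconj(const cfloat a) {  // complex conjugation
  cfloat r; r.re = a.re; r.im = -a.im; return r;
}

// Updates the upper triangle of a column-major n-by-n matrix A (lda >= n)
void her2_reference(const int n, const cfloat alpha, const cfloat* x, const cfloat* y,
                    cfloat* A, const int lda) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i <= j; ++i) {  // upper triangle only: i <= j
      const cfloat t1 = cmul(alpha, cmul(x[i], cconj(y[j])));         // alpha * x * y^H
      const cfloat t2 = cmul(cconj(alpha), cmul(y[i], cconj(x[j])));  // conj(alpha) * y * x^H
      A[j*lda + i].re += t1.re + t2.re;
      A[j*lda + i].im += t1.im + t2.im;
    }
  }
}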
R"( // ================================================================================================= // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) #endif void Xher2(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y #pragma promote_to_registers real xvalues[WPT]; #pragma promote_to_registers real yvalues[WPT]; #pragma promote_to_registers real xtvalues[WPT]; #pragma promote_to_registers real ytvalues[WPT]; // Loads the X-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id2 = _w*get_global_size(1) + get_global_id(1); xvalues[_w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor); } // Loads the X-transposed-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id1 = _w*get_global_size(0) + get_global_id(0); xtvalues[_w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor); } // Loads the Y-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id1 = _w*get_global_size(0) + get_global_id(0); yvalues[_w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor); } // Loads the Y-transposed-vector #pragma unroll for (int _w = 0; _w < WPT; _w += 1) { const int id2 = _w*get_global_size(1) + get_global_id(1); ytvalues[_w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor); } // Sets the proper value of alpha in case conjugation is needed real alpha1 = alpha; real alpha2 = alpha; #if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2) if (is_rowmajor) { COMPLEX_CONJUGATE(alpha1); } else { COMPLEX_CONJUGATE(alpha2); } #endif // Loops over the work per thread twice #pragma unroll for (int _w1 = 0; _w1 < WPT; _w1 += 1) { #pragma unroll for (int _w2 = 0; _w2 < WPT; _w2 += 1) { // Global thread IDs const int id1 = _w1*get_global_size(0) + get_global_id(0); const int id2 = _w2*get_global_size(1) + get_global_id(1); // Skip these threads if they do not contain threads contributing to the matrix-triangle if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) { // Do nothing } // Loads A, performs the operation, and stores the result into A else { MatrixUpdate2(id1, id2, n, n, agm, a_offset, a_ld, alpha1, xvalues[_w2], yvalues[_w1], alpha2, xtvalues[_w1], ytvalues[_w2], is_upper); } } } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level2/xtrsv.opencl000066400000000000000000000115601463263031500210260ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // This file contains kernels to perform forward or backward substition, as used in the TRSV routine // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= #if defined(ROUTINE_TRSV) __kernel void FillVector(const int n, const int inc, const int offset, __global real* restrict dest, const real_arg arg_value) { const real value = GetRealArg(arg_value); const int tid = get_global_id(0); if (tid < n) { dest[tid*inc + offset] = value; } } // ================================================================================================= // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. #ifndef TRSV_BLOCK_SIZE #define TRSV_BLOCK_SIZE 32 // The block size for forward or backward substition #endif // ================================================================================================= #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) #endif void trsv_forward(int n, const __global real *A, const int a_offset, int a_ld, __global real *b, const int b_offset, int b_inc, __global real *x, const int x_offset, int x_inc, const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; __local real xlm[TRSV_BLOCK_SIZE]; const int tid = get_local_id(0); // Pre-loads the data into local memory if (tid < n) { Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); if (is_transposed == 0) { for (int i = 0; i < n; ++i) { alm[i][tid] = A[i + tid*a_ld + a_offset]; } } else { for (int i = 0; i < n; ++i) { alm[i][tid] = A[tid + i*a_ld + a_offset]; } } if (do_conjugate) { for (int i = 0; i < n; ++i) { COMPLEX_CONJUGATE(alm[i][tid]); } } } barrier(CLK_LOCAL_MEM_FENCE); // Computes the result (single-threaded for now) if (tid == 0) { for (int i = 0; i < n; ++i) { for (int j = 0; j < i; ++j) { MultiplySubtract(xlm[i], alm[i][j], xlm[j]); } if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); } } } barrier(CLK_LOCAL_MEM_FENCE); // Stores the results if (tid < n) { x[tid*x_inc + x_offset] = xlm[tid]; } } #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) #endif void trsv_backward(int n, const __global real *A, const int a_offset, int a_ld, __global real *b, const int b_offset, int b_inc, __global real *x, const int x_offset, int x_inc, const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; __local real xlm[TRSV_BLOCK_SIZE]; const int tid = get_local_id(0); // Pre-loads the data into local memory if (tid < n) { Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); if (is_transposed == 0) { for (int i = 0; i < n; ++i) { alm[i][tid] = A[i + tid*a_ld + a_offset]; } } else { for (int i = 0; i < n; ++i) { alm[i][tid] = A[tid + i*a_ld + a_offset]; } } if (do_conjugate) { for (int i = 0; i < n; ++i) { COMPLEX_CONJUGATE(alm[i][tid]); } } } barrier(CLK_LOCAL_MEM_FENCE); // Computes the result (single-threaded for now) if (tid == 0) { for 
(int i = n - 1; i >= 0; --i) { for (int j = i + 1; j < n; ++j) { MultiplySubtract(xlm[i], alm[i][j], xlm[j]); } if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); } } } barrier(CLK_LOCAL_MEM_FENCE); // Stores the results if (tid < n) { x[tid*x_inc + x_offset] = xlm[tid]; } } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/000077500000000000000000000000001463263031500164345ustar00rootroot00000000000000CLBlast-1.6.3/src/kernels/level3/convert_hermitian.opencl000066400000000000000000000105401463263031500233560ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains kernels to convert hermitian matrices to/from general matrices. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= #if defined(ROUTINE_HEMM) #if PRECISION == 3232 || PRECISION == 6464 // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. 
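// As a small worked example (illustrative) of the conversion below: given a 2x2 complex
// source matrix of which only the lower triangle holds valid data,
//
//   src = [ (2, 5)    ...   ]          dest = [ (2, 0)  (3,-1) ]
//         [ (3, 1)  (5,-7)  ]   -->           [ (3, 1)  (5, 0) ]
//
// each strictly-upper element of 'dest' is the complex conjugate of its mirrored lower
// element, and the imaginary part of the diagonal is forced to zero, as required for a
// Hermitian matrix. The unused upper triangle of 'src' is never read.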
#if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void HermLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, const int dest_ld, const int dest_offset, __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the lower-hermitian matrix real result; SetToZero(result); if (id_two < src_dim && id_one < src_dim) { if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; if (id_one == id_two) { result.y = ZERO; } } else { result = src[id_one*src_ld + id_two + src_offset]; COMPLEX_CONJUGATE(result); } } // Stores the result in the destination matrix dest[id_two*dest_ld + id_one + dest_offset] = result; } } } } // Same as above, but now the matrix' data is stored in the upper-triangle #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void HermUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, const int dest_ld, const int dest_offset, __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the upper-hermitian matrix real result; SetToZero(result); if (id_two < src_dim && id_one < src_dim) { if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; if (id_one == id_two) { result.y = ZERO; } } else { result = src[id_one*src_ld + id_two + src_offset]; COMPLEX_CONJUGATE(result); } } // Stores the result in the destination matrix dest[id_two*dest_ld + id_one + dest_offset] = result; } } } } #endif #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/convert_symmetric.opencl000066400000000000000000000100741463263031500234140ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains kernels to convert symmetric matrices to/from general matrices. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). 
Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= #if defined(ROUTINE_SYMM) // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void SymmLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, const int dest_ld, const int dest_offset, __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the lower-symmetric matrix real result; SetToZero(result); if (id_two < src_dim && id_one < src_dim) { if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; } else { result = src[id_one*src_ld + id_two + src_offset]; } } // Stores the result in the destination matrix dest[id_two*dest_ld + id_one + dest_offset] = result; } } } } // Same as above, but now the matrix' data is stored in the upper-triangle #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void SymmUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, const int dest_ld, const int dest_offset, __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the upper-symmetric matrix real result; SetToZero(result); if (id_two < src_dim && id_one < src_dim) { if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; } else { result = src[id_one*src_ld + id_two + src_offset]; } } // Stores the result in the destination matrix dest[id_two*dest_ld + id_one + dest_offset] = result; } } } } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/convert_triangular.opencl000066400000000000000000000103061463263031500235460ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains kernels to convert triangular matrices to/from general matrices. 
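// As a small worked example (illustrative): expanding a 3x3 lower-triangular source with
// unit_diagonal == 1 yields
//
//   src = [ 2 . . ]          dest = [ 1 0 0 ]
//         [ 4 3 . ]   -->           [ 4 1 0 ]
//         [ 7 6 5 ]                 [ 7 6 1 ]
//
// the unused triangle of the destination is zero-filled, the stored diagonal is replaced by
// ones, and the undefined upper elements of 'src' are never read.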
// // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= #if defined(ROUTINE_TRMM) // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void TriaLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, const int dest_ld, const int dest_offset, __global real* dest, const int unit_diagonal) { // Loops over the work per thread in both dimensions #pragma unroll for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the lower-triangular matrix real result; SetToZero(result); if (id_two < src_dim && id_one < src_dim) { if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; } if (id_two == id_one && unit_diagonal) { SetToOne(result); } // Else: result is zero } // Stores the result in the destination matrix dest[id_two*dest_ld + id_one + dest_offset] = result; } } } } // Same as above, but now the matrix' data is stored in the upper-triangle #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void TriaUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, const int dest_ld, const int dest_offset, __global real* dest, const int unit_diagonal) { // Loops over the work per thread in both dimensions #pragma unroll for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_dim && id_one < dest_dim) { // Loads data from the upper-triangular matrix real result; SetToZero(result); if (id_two < src_dim && id_one < src_dim) { if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; } if (id_one == id_two && unit_diagonal) { SetToOne(result); } // Else: result is zero } // Stores the result in the destination matrix dest[id_two*dest_ld + id_one + dest_offset] = result; } } } } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/copy_fast.opencl000066400000000000000000000074341463263031500216350ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the common kernels shared among different BLAS routines. This file contains // kernels to copy matrices. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Data-widths #if COPY_VW == 1 typedef real realC; #elif COPY_VW == 2 typedef real2 realC; #elif COPY_VW == 4 typedef real4 realC; #elif COPY_VW == 8 typedef real8 realC; #elif COPY_VW == 16 typedef real16 realC; #endif // ================================================================================================= // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void CopyMatrixFast(const int ld, __global const realC* restrict src, __global realC* dest, const real_arg arg_alpha) { const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int _w_one = 0; _w_one < COPY_WPT; _w_one += 1) { const int id_one = get_global_id(0); const int id_two = (get_group_id(1)*COPY_WPT + _w_one) * COPY_DIMY + get_local_id(1); const int id = id_two*(ld/COPY_VW) + id_one; realC result; #if COPY_VW == 1 Multiply(result, alpha, src[id]); #elif COPY_VW == 2 Multiply(result.x, alpha, src[id].x); Multiply(result.y, alpha, src[id].y); #elif COPY_VW == 4 Multiply(result.x, alpha, src[id].x); Multiply(result.y, alpha, src[id].y); Multiply(result.z, alpha, src[id].z); Multiply(result.w, alpha, src[id].w); #elif COPY_VW == 8 Multiply(result.s0, alpha, src[id].s0); Multiply(result.s1, alpha, src[id].s1); Multiply(result.s2, alpha, src[id].s2); Multiply(result.s3, alpha, src[id].s3); Multiply(result.s4, alpha, src[id].s4); Multiply(result.s5, alpha, src[id].s5); Multiply(result.s6, alpha, src[id].s6); Multiply(result.s7, alpha, src[id].s7); #elif COPY_VW == 16 Multiply(result.s0, alpha, src[id].s0); Multiply(result.s1, alpha, src[id].s1); Multiply(result.s2, alpha, src[id].s2); Multiply(result.s3, alpha, src[id].s3); Multiply(result.s4, alpha, src[id].s4); Multiply(result.s5, alpha, src[id].s5); Multiply(result.s6, alpha, src[id].s6); Multiply(result.s7, alpha, src[id].s7); Multiply(result.s8, alpha, src[id].s8); Multiply(result.s9, alpha, src[id].s9); Multiply(result.sA, alpha, src[id].sA); Multiply(result.sB, alpha, src[id].sB); Multiply(result.sC, alpha, src[id].sC); Multiply(result.sD, alpha, src[id].sD); Multiply(result.sE, alpha, src[id].sE); Multiply(result.sF, alpha, src[id].sF); #endif dest[id] = result;; } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/copy_pad.opencl000066400000000000000000000254621463263031500214450ustar00rootroot00000000000000 // 
================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the common kernels shared among different BLAS functions. This file contains // kernels to copy and pad matrices in various ways, including: // 1) copying into a larger matrix by adding padding // 2) copying into a smaller matrix by optionally removing padding. This is the general version // without restrictions, see the 'copy.opencl' file for a faster but more restricted copy kernel. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Copies a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld // value and offset can be different. INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, const real alpha, const int do_conjugate) { // Loops over the work per thread in both dimensions #pragma unroll for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); if (id_two < dest_two && id_one < dest_one) { // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the // value to be written to zero. real value; SetToZero(value); if (id_two < src_two && id_one < src_one) { value = src[id_two*src_ld + id_one + src_offset]; } // Stores the value in the destination matrix if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } Multiply(dest[id_two*dest_ld + id_one + dest_offset], alpha, value); } } } } // Interface to the above function #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyPadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, const real_arg arg_alpha, const int do_conjugate) { const real alpha = GetRealArg(arg_alpha); _CopyPadMatrix(src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset, dest, alpha, do_conjugate); } // ================================================================================================= // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but // writes only the actual data back to the destination matrix. Again, the ld value and offset can // be different. 
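// As an illustrative aid, the launch geometry for these pad/copy kernels follows from the
// destination size and the tuning parameters: each thread handles PAD_WPTX x PAD_WPTY
// elements and work-groups are PAD_DIMX x PAD_DIMY threads. A host-side sketch (the helper
// and function names are hypothetical, assuming the same PAD_* values are defined on the
// host):

#include <stddef.h>

size_t CeilToMultiple(const size_t value, const size_t multiple) {
  return ((value + multiple - 1) / multiple) * multiple;  // rounds up to a multiple
}

// Computes the 2D global work-size for a dest_one x dest_two destination matrix
void PadCopyGlobalSize(const size_t dest_one, const size_t dest_two, size_t global[2]) {
  global[0] = CeilToMultiple((dest_one + PAD_WPTX - 1) / PAD_WPTX, PAD_DIMX);
  global[1] = CeilToMultiple((dest_two + PAD_WPTY - 1) / PAD_WPTY, PAD_DIMY);
  // The local work-size is {PAD_DIMX, PAD_DIMY}; the bounds checks inside the kernels
  // handle any overshoot caused by the rounding above.
}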
INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, const real alpha, const int upper, const int lower, const int diagonal_imag_zero) { // Loops over the work per thread in both dimensions #pragma unroll for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) { const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0); #pragma unroll for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) { const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1); // Masking in case of triangular matrices: updates only the upper or lower part bool condition = true; #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) if (upper == 1) { condition = (id_two >= id_one); } else if (lower == 1) { condition = (id_two <= id_one); } #endif if (condition) { // Copies the value into the destination matrix. This is always within bounds of the source // matrix, as we know that the destination matrix is smaller or equal to the source. if (id_two < dest_two && id_one < dest_one) { real value = src[id_two*src_ld + id_one + src_offset]; if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } Multiply(dest[id_two*dest_ld + id_one + dest_offset], alpha, value); } } } } } // Interface to the above function #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, const real_arg arg_alpha, const int upper, const int lower, const int diagonal_imag_zero) { const real alpha = GetRealArg(arg_alpha); _CopyMatrix(src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset, dest, alpha, upper, lower, diagonal_imag_zero); } // ================================================================================================= #if defined(ROUTINE_GEMMBATCHED) // Batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyPadMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const __constant int* dest_offsets, __global real* dest, const int do_conjugate) { const int batch = get_group_id(2); const int src_offset = src_offsets[batch]; const int dest_offset = dest_offsets[batch]; real alpha; SetToOne(alpha); _CopyPadMatrix(src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset, dest, alpha, do_conjugate); } // Batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const __constant int* dest_offsets, __global real* dest) { const int batch = get_group_id(2); const int src_offset = src_offsets[batch]; const int dest_offset = dest_offsets[batch]; real alpha; 
SetToOne(alpha); _CopyMatrix(src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset, dest, alpha, 0, 0, 0); } #endif // ================================================================================================= #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Strided-batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyPadMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, const int dest_stride, __global real* dest, const int do_conjugate) { const int batch = get_group_id(2); const int src_offset_batch = src_offset + src_stride * batch; const int dest_offset_batch = dest_offset + dest_stride * batch; real alpha; SetToOne(alpha); _CopyPadMatrix(src_one, src_two, src_ld, src_offset_batch, src, dest_one, dest_two, dest_ld, dest_offset_batch, dest, alpha, do_conjugate); } // Strided-batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, const int dest_stride, __global real* dest) { const int batch = get_group_id(2); const int src_offset_batch = src_offset + src_stride * batch; const int dest_offset_batch = dest_offset + dest_stride * batch; real alpha; SetToOne(alpha); _CopyMatrix(src_one, src_two, src_ld, src_offset_batch, src, dest_one, dest_two, dest_ld, dest_offset_batch, dest, alpha, 0, 0, 0); } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/invert_diagonal_blocks_part1.opencl000066400000000000000000000343551463263031500254610ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains kernels to invert squared diagonal blocks of a matrix. These kernels are based // on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular // Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, // and Jack Dongarra. // // This is part 1 of 2, see part 2 for the remainder of the kernel code. // // ================================================================================================= // // Let A be an block_size*block_size lower triangular matrix, and B its inverse. // Then the block decomposition // // [ A11 0 ] * [ B11 0 ] = [ I 0 ] // [ A21 A22 ] [ B21 B22 ] [ 0 I ] // // yields // // A11*B11 = I ==> B11 = A11^{-1}, // A22*B22 = I ==> B22 = A22^{-1}, // A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11. 
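// As a small worked example (illustrative) of the decomposition above, take block_size 2 with
//
//   A = [ 2 0 ]   ==>  B11 = 1/2,  B22 = 1/4,  B21 = -B22*A21*B11 = -(1/4)*3*(1/2) = -3/8,
//       [ 3 4 ]
//
// which matches the direct inverse A^{-1} = [ 1/2, 0; -3/8, 1/4 ].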
// // The InvertDiagonalBlock kernel inverts A11 and A22. // The TripleMatMul routines multiply: // part 1: B21 = A21 * B11, // part 2: B21 = -B22 * B21. // // At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each // submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2 // at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block, // here current_size*2=32, // [ B11 0 ] // which contains [ B21 B22 ]. // Outer blocks are block_size x block_size. // // A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on // the right. This makes a single check easy to do. // // B is stored in workspace that is a full multiple of block_size x block_size; no checks needed. // // We split this into part1 & part2 to synchronize all blocks and make sure // that writes to B12 are observed by all blocks. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= #if defined(ROUTINE_INVERT) // Parameters set by the tuner // TODO: Make these actually tunable #ifndef INTERNAL_BLOCK_SIZE #define INTERNAL_BLOCK_SIZE 16 // Internal block size of the invert kernel #endif #ifndef LOCALPAD #define LOCALPAD 0 // Padding in the x-dimension of the local memory to avoid bank conflicts #endif #ifndef LOCALX #define LOCALX (16 + LOCALPAD) // Local memory size in x-dimension of TripleMatMul kernels #endif #ifndef LOCALY #define LOCALY 16 // Local memory size in y-dimension of TripleMatMul kernels #endif #ifndef TMMWGSX #define TMMWGSX 4 // Work-group size in x-dimension of TripleMatMul kernels #endif #ifndef TMMWGSY #define TMMWGSY 4 // Work-group size in y-dimension of TripleMatMul kernels #endif // ================================================================================================= // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) #endif void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld, __global real* restrict dest, const int outer_block_size, const int unit_diagonal, const int is_upper) { const int thread_index = get_local_id(0); const int block_index = get_group_id(0); // Sets the offset for this particular block in the source and destination matrices const int block_index_per_block = block_index * INTERNAL_BLOCK_SIZE; const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset; const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE; const int block_index_div = block_index / num_inner_blocks; const int block_index_mod = block_index % num_inner_blocks; const int offset_part1 = block_index_div * outer_block_size * outer_block_size; // go to the block_index_div outer outer_block_size*outer_block_size block const int offset_part2 = block_index_mod * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the block_index_mod inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that const int dest_block_offset = offset_part1 + 
offset_part2; // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; // Loads the source lower triangle into local memory. Any values in the upper triangle or // outside of the matrix are set to zero for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) { bool condition = false; if (is_upper) { condition = (thread_index <= _j) && (block_index_per_block + _j < n); } else { condition = (thread_index >= _j) && (block_index_per_block + thread_index < n); } if (condition) { const int src_index = _j*src_ld + thread_index + src_block_offset; lm[thread_index][_j] = src[src_index]; } else { SetToZero(lm[thread_index][_j]); } } barrier(CLK_LOCAL_MEM_FENCE); // Inverts the diagonal real inverted_diagonal; SetToOne(inverted_diagonal); if (unit_diagonal == 0) { const real diagonal_value = lm[thread_index][thread_index]; if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix real constant_one; SetToOne(constant_one); DivideFull(inverted_diagonal, constant_one, diagonal_value); } } lm[thread_index][thread_index] = inverted_diagonal; barrier(CLK_LOCAL_MEM_FENCE); // Upper-triangular if (is_upper) { // Computes the elements 0:j-1 of the j-th column for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) { real sum; if (thread_index < j) { SetToZero(sum); for (int k = 0; k < j; ++k) { MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); } } barrier(CLK_LOCAL_MEM_FENCE); if (thread_index < j) { real diagonal_value = lm[j][j]; Negate(diagonal_value); Multiply(lm[thread_index][j], diagonal_value, sum); } barrier(CLK_LOCAL_MEM_FENCE); } } // Lower triangular else { // Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) { real sum; if (thread_index > j) { SetToZero(sum); for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) { MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); } } barrier(CLK_LOCAL_MEM_FENCE); if (thread_index > j) { real diagonal_value = lm[j][j]; Negate(diagonal_value); Multiply(lm[thread_index][j], diagonal_value, sum); } barrier(CLK_LOCAL_MEM_FENCE); } } // Writes the result to global memory #pragma unroll for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) { dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; } } // ================================================================================================= // Triple matrix-multiplication kernel: C = A * B INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, LOCAL_PTR real* blm, int n, __global const real* agm, __global const real* bgm, __global real* cgm, const int lda, const int ldb, const int ldc, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int by = get_group_id(1) / num_pages; const int page = get_group_id(1) % num_pages; const int lidx = get_local_id(0); const int lidy = get_local_id(1); const int ibx = get_group_id(0) * (get_local_size(0) * TMMWGSY); const int iby = by*16; const int id = lidx + lidy*get_local_size(0); const int row = page*current_size*2 + current_size + ibx + id; int col = page*current_size*2 + current_size; // Sets the offsets for this specific thread agm += ibx + id; bgm += lidx + (iby + lidy)*ldb; cgm += ibx + id + iby*ldc; // Initializes the result registers real cpm[16]; #pragma unroll for (int _j = 0; _j < 16; _j += 1) { SetToZero(cpm[_j]); } // Computes NT x 16 block of C, each thread computes one 1 x 16 row for (int k 
= 0; k < current_size; k += 16) { // Loads a 16 x 16 block of B into local memory using NX x 4 threads for (int i = 0; i < 16; i += (size/4) ) { // += get_local_size(0) for (int _j = 0; _j < 16; _j += TMMWGSY ) { // += get_local_size(1) blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb]; } } barrier(CLK_LOCAL_MEM_FENCE); // Upper triangular if (upper) { // Performs 16 x 16 multiply-add operations #pragma unroll for (int _i = 0; _i < 16; _i += 1) { if (part == 2 || col++ < n) { #pragma unroll for (int _j = 0; _j < 16; _j += 1) { MultiplyAdd(cpm[_j], agm[(_i + k) * lda], blm[_i * LOCALX + _j]); } } } } // Lower triangular else { if (row < n) { // Performs 16 x 16 multiply-add operations #pragma unroll for (int _i = 0; _i < 16; _i += 1) { #pragma unroll for (int _j = 0; _j < 16; _j += 1) { MultiplyAdd(cpm[_j], agm[(_i + k) * lda], blm[_i * LOCALX + _j]); } } } } barrier(CLK_LOCAL_MEM_FENCE); } // Stores NT x 16 results: each thread writes one 16 x 1 row #pragma unroll for (int _i = 0; _i < 16; _i += 1) { if (part == 2) { Negate(cpm[_i]); } cgm[0] = cpm[_i]; cgm += ldc; } } // ================================================================================================= // Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR real* blm, int n, __global const real* src, const int a_offset, const int lda, __global real* dest, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; // Computes the destination block offset: // - go to the (page / pages_per_block) outer block_size * block_size block // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that const int pages_per_block = block_size / (current_size*2); dest += (page / pages_per_block) * block_size * block_size + (page % pages_per_block) * (current_size*2*block_size + current_size*2); // Using the GEMM notation: C = A*B __global const real* agm; __global const real* bgm; __global real* cgm; if (upper) { // upper triangular: B12 = A12 * B22 agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12 bgm = dest + current_size*block_size + current_size; // B22 cgm = dest + current_size*block_size; // B12 } else { // lower triangular: B21 = A21 * B11 agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21 bgm = dest; // B11 cgm = dest + current_size; // B21 } // Runs the generic C = A * B matrix multiplication const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n, __global real* dest, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; // Computes the destination block offset: // - go to the (page / pages_per_block) outer block_size * block_size block // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that const int pages_per_block = block_size / (current_size*2); dest += (page / pages_per_block) * block_size * block_size + (page % pages_per_block) * 
(current_size*2*block_size + current_size*2); // Using the GEMM notation: C = A*B __global const real* agm; __global const real* bgm; __global real* cgm; if (upper) { // upper triangular: B12 = -B11 * B12 agm = dest; // B11 cgm = dest + current_size*block_size; // B12 bgm = cgm; // B12, okay to overwrite } else { // lower triangular: B21 = -B22 * B21 agm = dest + current_size*block_size + current_size; // B22 cgm = dest + current_size; // B21 bgm = cgm; // B21, okay to overwrite } // Runs the generic C = A * B matrix multiplication const int lda = block_size; const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/invert_diagonal_blocks_part2.opencl000066400000000000000000000121741463263031500254550ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This is part 2 of 2, see part 1 of the invert kernel for a description // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. 
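// The part-2 file that follows instantiates TripleMatMulPart1 and TripleMatMulPart2 for the
// sizes 16, 32 and 64 and for both triangles, so the host side only has to compose the right
// kernel name, e.g. "TripleMatMul32Part1Lower". A sketch of that composition (illustrative
// only; the function name is hypothetical, not the CLBlast API):

#include <stdio.h>

// Writes e.g. "TripleMatMul64Part2Upper" into 'name' (needs room for 25 characters)
void TripleMatMulKernelName(char* name, const size_t size,
                            const int wgs_x,     // 16, 32 or 64
                            const int part,      // 1 or 2
                            const int is_upper) {
  snprintf(name, size, "TripleMatMul%dPart%d%s", wgs_x, part, is_upper ? "Upper" : "Lower");
}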
R"( // ================================================================================================= #if defined(ROUTINE_INVERT) // B21 = A21 * B11 __kernel void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); } // B21 = -B22 * B21 __kernel void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size); } // B21 = A21 * B11 __kernel void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); } // B21 = -B22 * B21 __kernel void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size); } // B21 = A21 * B11 __kernel void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); } // B21 = -B22 * B21 __kernel void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); } // ================================================================================================= // B12 = A12 * B22 __kernel void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); } // B12 = -B11 * B12 __kernel void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size); } // B12 = A12 * B22 __kernel void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); } // B12 = -B11 * B12 __kernel void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size); } // B12 = A12 * B22 __kernel void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int 
num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); } // B12 = -B11 * B12 __kernel void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/level3.opencl000066400000000000000000000067031463263031500210360ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the common functions and parameters specific for level 3 BLAS kernels. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. 
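// When tuned, these parameters are instead passed as pre-processor definitions through the
// OpenCL build options. An illustrative host-side call (the values shown are examples only;
// the real ones come from the tuning database):
//
//   clBuildProgram(program, 1, &device,
//                  "-DCOPY_DIMX=16 -DCOPY_DIMY=8 -DCOPY_WPT=2 -DCOPY_VW=4",
//                  NULL, NULL);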
// For the 'fast' copy kernel #ifndef COPY_DIMX #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) #endif #ifndef COPY_DIMY #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) #endif #ifndef COPY_WPT #define COPY_WPT 1 // Work per thread in the first dimension (x) #endif #ifndef COPY_VW #define COPY_VW 1 // Vector width in the second dimension (y) #endif // For the padding/copy kernels and the conversion kernels #ifndef PAD_DIMX #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) #endif #ifndef PAD_DIMY #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) #endif #ifndef PAD_WPTX #define PAD_WPTX 1 // Work per thread in the first dimension (x) #endif #ifndef PAD_WPTY #define PAD_WPTY 1 // Work per thread in the second dimension (y) #endif // For the 'fast' transpose kernel #ifndef TRA_DIM #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) #endif #ifndef TRA_WPT #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other #endif #ifndef TRA_PAD #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts #endif #ifndef TRA_SHUFFLE #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts #endif // For the padding/transpose kernels #ifndef PADTRA_TILE #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) #endif #ifndef PADTRA_WPT #define PADTRA_WPT 1 // Amount of work per thread #endif #ifndef PADTRA_PAD #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts #endif // ================================================================================================= #if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM) __kernel void FillMatrix(const int m, const int n, const int ld, const int offset, __global real* restrict dest, const real_arg arg_value) { const real value = GetRealArg(arg_value); const int id_one = get_global_id(0); const int id_two = get_global_id(1); if (id_one < m && id_two < n) { dest[id_two*ld + id_one + offset] = value; } } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/transpose_fast.opencl000066400000000000000000000362471463263031500227050ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the common kernels shared among different BLAS functions. This file contains // a kernel to transpose matrices. This is a 'fast' version with restrictions, see the // 'padtranspose.opencl' file for a general transpose kernel. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. 
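// Each work-group of TRA_DIM x TRA_DIM threads in the kernel below transposes one square tile
// of (TRA_DIM * TRA_WPT) x (TRA_DIM * TRA_WPT) elements, so the matrix dimension must be a
// multiple of that tile size. A host-side sketch of the resulting launch geometry
// (illustrative only, assuming a square n x n matrix; not the CLBlast API):

#include <stddef.h>

// Returns 1 when the 'fast' transpose applies to an n x n matrix with leading dimension ld
int CanUseFastTranspose(const int n, const int ld) {
  const int tile = TRA_DIM * TRA_WPT;
  return (n % tile == 0) && (ld == n);  // same dimensions, no offset, whole tiles only
}

// Computes the 2D global work-size: each thread covers TRA_WPT x TRA_WPT elements
void FastTransposeGlobalSize(const int n, size_t global[2]) {
  global[0] = (size_t)(n / TRA_WPT);
  global[1] = (size_t)(n / TRA_WPT);
  // The local work-size is {TRA_DIM, TRA_DIM}.
}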
R"( // ================================================================================================= // Data-widths #if TRA_WPT == 1 typedef real realT; #elif TRA_WPT == 2 typedef real2 realT; #elif TRA_WPT == 4 typedef real4 realT; #elif TRA_WPT == 8 typedef real8 realT; #elif TRA_WPT == 16 typedef real16 realT; #endif // ================================================================================================= // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without // offset. A more general version is available in 'padtranspose.opencl'. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) #endif void TransposeMatrixFast(const int ld, __global const realT* restrict src, __global realT* dest, const real_arg arg_alpha) { const real alpha = GetRealArg(arg_alpha); // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different // way over workgroups, breaking memory-bank dependencies. const int gid0 = get_group_id(0); #if TRA_SHUFFLE == 1 const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); #else const int gid1 = get_group_id(1); #endif // Local memory to store a tile of the matrix (for coalescing) __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD]; // Loops over the work per thread #pragma unroll for (int _w_one = 0; _w_one < TRA_WPT; _w_one += 1) { // Computes the identifiers for the source matrix. Note that the local and global dimensions // do not correspond to each other! const int id_one = gid1 * TRA_DIM + get_local_id(0); const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + _w_one; // Loads data into the local memory realT value = src[id_two*(ld/TRA_WPT) + id_one]; tile[get_local_id(0)*TRA_WPT + _w_one][get_local_id(1)] = value; } // Synchronizes all threads in a workgroup barrier(CLK_LOCAL_MEM_FENCE); // Loads transposed data from the local memory #pragma promote_to_registers realT vpm[TRA_WPT]; #pragma unroll for (int _w_one = 0; _w_one < TRA_WPT; _w_one += 1) { vpm[_w_one] = tile[get_local_id(1)*TRA_WPT + _w_one][get_local_id(0)]; } // Performs the register-level transpose of the vectorized data #pragma promote_to_registers realT results[TRA_WPT]; #if TRA_WPT == 1 results[0] = vpm[0]; #elif TRA_WPT == 2 results[0].x = vpm[0].x; results[0].y = vpm[1].x; results[1].x = vpm[0].y; results[1].y = vpm[1].y; #elif TRA_WPT == 4 results[0].x = vpm[0].x; results[0].y = vpm[1].x; results[0].z = vpm[2].x; results[0].w = vpm[3].x; results[1].x = vpm[0].y; results[1].y = vpm[1].y; results[1].z = vpm[2].y; results[1].w = vpm[3].y; results[2].x = vpm[0].z; results[2].y = vpm[1].z; results[2].z = vpm[2].z; results[2].w = vpm[3].z; results[3].x = vpm[0].w; results[3].y = vpm[1].w; results[3].z = vpm[2].w; results[3].w = vpm[3].w; #elif TRA_WPT == 8 results[0].s0 = vpm[0].s0; results[0].s1 = vpm[1].s0; results[0].s2 = vpm[2].s0; results[0].s3 = vpm[3].s0; results[0].s4 = vpm[4].s0; results[0].s5 = vpm[5].s0; results[0].s6 = vpm[6].s0; results[0].s7 = vpm[7].s0; results[1].s0 = vpm[0].s1; results[1].s1 = vpm[1].s1; results[1].s2 = vpm[2].s1; results[1].s3 = vpm[3].s1; results[1].s4 = vpm[4].s1; results[1].s5 = vpm[5].s1; results[1].s6 = vpm[6].s1; results[1].s7 = vpm[7].s1; results[2].s0 = vpm[0].s2; results[2].s1 = vpm[1].s2; results[2].s2 = vpm[2].s2; results[2].s3 = vpm[3].s2; results[2].s4 = vpm[4].s2; results[2].s5 = vpm[5].s2; results[2].s6 = vpm[6].s2; results[2].s7 = vpm[7].s2; results[3].s0 = vpm[0].s3; 
results[3].s1 = vpm[1].s3; results[3].s2 = vpm[2].s3; results[3].s3 = vpm[3].s3; results[3].s4 = vpm[4].s3; results[3].s5 = vpm[5].s3; results[3].s6 = vpm[6].s3; results[3].s7 = vpm[7].s3; results[4].s0 = vpm[0].s4; results[4].s1 = vpm[1].s4; results[4].s2 = vpm[2].s4; results[4].s3 = vpm[3].s4; results[4].s4 = vpm[4].s4; results[4].s5 = vpm[5].s4; results[4].s6 = vpm[6].s4; results[4].s7 = vpm[7].s4; results[5].s0 = vpm[0].s5; results[5].s1 = vpm[1].s5; results[5].s2 = vpm[2].s5; results[5].s3 = vpm[3].s5; results[5].s4 = vpm[4].s5; results[5].s5 = vpm[5].s5; results[5].s6 = vpm[6].s5; results[5].s7 = vpm[7].s5; results[6].s0 = vpm[0].s6; results[6].s1 = vpm[1].s6; results[6].s2 = vpm[2].s6; results[6].s3 = vpm[3].s6; results[6].s4 = vpm[4].s6; results[6].s5 = vpm[5].s6; results[6].s6 = vpm[6].s6; results[6].s7 = vpm[7].s6; results[7].s0 = vpm[0].s7; results[7].s1 = vpm[1].s7; results[7].s2 = vpm[2].s7; results[7].s3 = vpm[3].s7; results[7].s4 = vpm[4].s7; results[7].s5 = vpm[5].s7; results[7].s6 = vpm[6].s7; results[7].s7 = vpm[7].s7; #elif TRA_WPT == 16 results[ 0].s0 = vpm[0].s0; results[ 0].s1 = vpm[1].s0; results[ 0].s2 = vpm[2].s0; results[ 0].s3 = vpm[3].s0; results[ 0].s4 = vpm[4].s0; results[ 0].s5 = vpm[5].s0; results[ 0].s6 = vpm[6].s0; results[ 0].s7 = vpm[7].s0; results[ 0].s8 = vpm[8].s0; results[ 0].s9 = vpm[9].s0; results[ 0].sA = vpm[10].s0; results[ 0].sB = vpm[11].s0; results[ 0].sC = vpm[12].s0; results[ 0].sD = vpm[13].s0; results[ 0].sE = vpm[14].s0; results[ 0].sF = vpm[15].s0; results[ 1].s0 = vpm[0].s1; results[ 1].s1 = vpm[1].s1; results[ 1].s2 = vpm[2].s1; results[ 1].s3 = vpm[3].s1; results[ 1].s4 = vpm[4].s1; results[ 1].s5 = vpm[5].s1; results[ 1].s6 = vpm[6].s1; results[ 1].s7 = vpm[7].s1; results[ 1].s8 = vpm[8].s1; results[ 1].s9 = vpm[9].s1; results[ 1].sA = vpm[10].s1; results[ 1].sB = vpm[11].s1; results[ 1].sC = vpm[12].s1; results[ 1].sD = vpm[13].s1; results[ 1].sE = vpm[14].s1; results[ 1].sF = vpm[15].s1; results[ 2].s0 = vpm[0].s2; results[ 2].s1 = vpm[1].s2; results[ 2].s2 = vpm[2].s2; results[ 2].s3 = vpm[3].s2; results[ 2].s4 = vpm[4].s2; results[ 2].s5 = vpm[5].s2; results[ 2].s6 = vpm[6].s2; results[ 2].s7 = vpm[7].s2; results[ 2].s8 = vpm[8].s2; results[ 2].s9 = vpm[9].s2; results[ 2].sA = vpm[10].s2; results[ 2].sB = vpm[11].s2; results[ 2].sC = vpm[12].s2; results[ 2].sD = vpm[13].s2; results[ 2].sE = vpm[14].s2; results[ 2].sF = vpm[15].s2; results[ 3].s0 = vpm[0].s3; results[ 3].s1 = vpm[1].s3; results[ 3].s2 = vpm[2].s3; results[ 3].s3 = vpm[3].s3; results[ 3].s4 = vpm[4].s3; results[ 3].s5 = vpm[5].s3; results[ 3].s6 = vpm[6].s3; results[ 3].s7 = vpm[7].s3; results[ 3].s8 = vpm[8].s3; results[ 3].s9 = vpm[9].s3; results[ 3].sA = vpm[10].s3; results[ 3].sB = vpm[11].s3; results[ 3].sC = vpm[12].s3; results[ 3].sD = vpm[13].s3; results[ 3].sE = vpm[14].s3; results[ 3].sF = vpm[15].s3; results[ 4].s0 = vpm[0].s4; results[ 4].s1 = vpm[1].s4; results[ 4].s2 = vpm[2].s4; results[ 4].s3 = vpm[3].s4; results[ 4].s4 = vpm[4].s4; results[ 4].s5 = vpm[5].s4; results[ 4].s6 = vpm[6].s4; results[ 4].s7 = vpm[7].s4; results[ 4].s8 = vpm[8].s4; results[ 4].s9 = vpm[9].s4; results[ 4].sA = vpm[10].s4; results[ 4].sB = vpm[11].s4; results[ 4].sC = vpm[12].s4; results[ 4].sD = vpm[13].s4; results[ 4].sE = vpm[14].s4; results[ 4].sF = vpm[15].s4; results[ 5].s0 = vpm[0].s5; results[ 5].s1 = vpm[1].s5; results[ 5].s2 = vpm[2].s5; results[ 5].s3 = vpm[3].s5; results[ 5].s4 = vpm[4].s5; results[ 5].s5 = vpm[5].s5; results[ 5].s6 = vpm[6].s5; results[ 5].s7 
= vpm[7].s5; results[ 5].s8 = vpm[8].s5; results[ 5].s9 = vpm[9].s5; results[ 5].sA = vpm[10].s5; results[ 5].sB = vpm[11].s5; results[ 5].sC = vpm[12].s5; results[ 5].sD = vpm[13].s5; results[ 5].sE = vpm[14].s5; results[ 5].sF = vpm[15].s5; results[ 6].s0 = vpm[0].s6; results[ 6].s1 = vpm[1].s6; results[ 6].s2 = vpm[2].s6; results[ 6].s3 = vpm[3].s6; results[ 6].s4 = vpm[4].s6; results[ 6].s5 = vpm[5].s6; results[ 6].s6 = vpm[6].s6; results[ 6].s7 = vpm[7].s6; results[ 6].s8 = vpm[8].s6; results[ 6].s9 = vpm[9].s6; results[ 6].sA = vpm[10].s6; results[ 6].sB = vpm[11].s6; results[ 6].sC = vpm[12].s6; results[ 6].sD = vpm[13].s6; results[ 6].sE = vpm[14].s6; results[ 6].sF = vpm[15].s6; results[ 7].s0 = vpm[0].s7; results[ 7].s1 = vpm[1].s7; results[ 7].s2 = vpm[2].s7; results[ 7].s3 = vpm[3].s7; results[ 7].s4 = vpm[4].s7; results[ 7].s5 = vpm[5].s7; results[ 7].s6 = vpm[6].s7; results[ 7].s7 = vpm[7].s7; results[ 7].s8 = vpm[8].s7; results[ 7].s9 = vpm[9].s7; results[ 7].sA = vpm[10].s7; results[ 7].sB = vpm[11].s7; results[ 7].sC = vpm[12].s7; results[ 7].sD = vpm[13].s7; results[ 7].sE = vpm[14].s7; results[ 7].sF = vpm[15].s7; results[ 8].s0 = vpm[0].s8; results[ 8].s1 = vpm[1].s8; results[ 8].s2 = vpm[2].s8; results[ 8].s3 = vpm[3].s8; results[ 8].s4 = vpm[4].s8; results[ 8].s5 = vpm[5].s8; results[ 8].s6 = vpm[6].s8; results[ 8].s7 = vpm[7].s8; results[ 8].s8 = vpm[8].s8; results[ 8].s9 = vpm[9].s8; results[ 8].sA = vpm[10].s8; results[ 8].sB = vpm[11].s8; results[ 8].sC = vpm[12].s8; results[ 8].sD = vpm[13].s8; results[ 8].sE = vpm[14].s8; results[ 8].sF = vpm[15].s8; results[ 9].s0 = vpm[0].s9; results[ 9].s1 = vpm[1].s9; results[ 9].s2 = vpm[2].s9; results[ 9].s3 = vpm[3].s9; results[ 9].s4 = vpm[4].s9; results[ 9].s5 = vpm[5].s9; results[ 9].s6 = vpm[6].s9; results[ 9].s7 = vpm[7].s9; results[ 9].s8 = vpm[8].s9; results[ 9].s9 = vpm[9].s9; results[ 9].sA = vpm[10].s9; results[ 9].sB = vpm[11].s9; results[ 9].sC = vpm[12].s9; results[ 9].sD = vpm[13].s9; results[ 9].sE = vpm[14].s9; results[ 9].sF = vpm[15].s9; results[10].s0 = vpm[0].sA; results[10].s1 = vpm[1].sA; results[10].s2 = vpm[2].sA; results[10].s3 = vpm[3].sA; results[10].s4 = vpm[4].sA; results[10].s5 = vpm[5].sA; results[10].s6 = vpm[6].sA; results[10].s7 = vpm[7].sA; results[10].s8 = vpm[8].sA; results[10].s9 = vpm[9].sA; results[10].sA = vpm[10].sA; results[10].sB = vpm[11].sA; results[10].sC = vpm[12].sA; results[10].sD = vpm[13].sA; results[10].sE = vpm[14].sA; results[10].sF = vpm[15].sA; results[11].s0 = vpm[0].sB; results[11].s1 = vpm[1].sB; results[11].s2 = vpm[2].sB; results[11].s3 = vpm[3].sB; results[11].s4 = vpm[4].sB; results[11].s5 = vpm[5].sB; results[11].s6 = vpm[6].sB; results[11].s7 = vpm[7].sB; results[11].s8 = vpm[8].sB; results[11].s9 = vpm[9].sB; results[11].sA = vpm[10].sB; results[11].sB = vpm[11].sB; results[11].sC = vpm[12].sB; results[11].sD = vpm[13].sB; results[11].sE = vpm[14].sB; results[11].sF = vpm[15].sB; results[12].s0 = vpm[0].sC; results[12].s1 = vpm[1].sC; results[12].s2 = vpm[2].sC; results[12].s3 = vpm[3].sC; results[12].s4 = vpm[4].sC; results[12].s5 = vpm[5].sC; results[12].s6 = vpm[6].sC; results[12].s7 = vpm[7].sC; results[12].s8 = vpm[8].sC; results[12].s9 = vpm[9].sC; results[12].sA = vpm[10].sC; results[12].sB = vpm[11].sC; results[12].sC = vpm[12].sC; results[12].sD = vpm[13].sC; results[12].sE = vpm[14].sC; results[12].sF = vpm[15].sC; results[13].s0 = vpm[0].sD; results[13].s1 = vpm[1].sD; results[13].s2 = vpm[2].sD; results[13].s3 = vpm[3].sD; results[13].s4 = 
vpm[4].sD; results[13].s5 = vpm[5].sD; results[13].s6 = vpm[6].sD; results[13].s7 = vpm[7].sD; results[13].s8 = vpm[8].sD; results[13].s9 = vpm[9].sD; results[13].sA = vpm[10].sD; results[13].sB = vpm[11].sD; results[13].sC = vpm[12].sD; results[13].sD = vpm[13].sD; results[13].sE = vpm[14].sD; results[13].sF = vpm[15].sD; results[14].s0 = vpm[0].sE; results[14].s1 = vpm[1].sE; results[14].s2 = vpm[2].sE; results[14].s3 = vpm[3].sE; results[14].s4 = vpm[4].sE; results[14].s5 = vpm[5].sE; results[14].s6 = vpm[6].sE; results[14].s7 = vpm[7].sE; results[14].s8 = vpm[8].sE; results[14].s9 = vpm[9].sE; results[14].sA = vpm[10].sE; results[14].sB = vpm[11].sE; results[14].sC = vpm[12].sE; results[14].sD = vpm[13].sE; results[14].sE = vpm[14].sE; results[14].sF = vpm[15].sE; results[15].s0 = vpm[0].sF; results[15].s1 = vpm[1].sF; results[15].s2 = vpm[2].sF; results[15].s3 = vpm[3].sF; results[15].s4 = vpm[4].sF; results[15].s5 = vpm[5].sF; results[15].s6 = vpm[6].sF; results[15].s7 = vpm[7].sF; results[15].s8 = vpm[8].sF; results[15].s9 = vpm[9].sF; results[15].sA = vpm[10].sF; results[15].sB = vpm[11].sF; results[15].sC = vpm[12].sF; results[15].sD = vpm[13].sF; results[15].sE = vpm[14].sF; results[15].sF = vpm[15].sF; #endif // Multiplies by alpha and then stores the results into the destination matrix #pragma unroll for (int _w_two = 0; _w_two < TRA_WPT; _w_two += 1) { realT result; #if TRA_WPT == 1 Multiply(result, alpha, results[_w_two]); #elif TRA_WPT == 2 Multiply(result.x, alpha, results[_w_two].x); Multiply(result.y, alpha, results[_w_two].y); #elif TRA_WPT == 4 Multiply(result.x, alpha, results[_w_two].x); Multiply(result.y, alpha, results[_w_two].y); Multiply(result.z, alpha, results[_w_two].z); Multiply(result.w, alpha, results[_w_two].w); #elif TRA_WPT == 8 Multiply(result.s0, alpha, results[_w_two].s0); Multiply(result.s1, alpha, results[_w_two].s1); Multiply(result.s2, alpha, results[_w_two].s2); Multiply(result.s3, alpha, results[_w_two].s3); Multiply(result.s4, alpha, results[_w_two].s4); Multiply(result.s5, alpha, results[_w_two].s5); Multiply(result.s6, alpha, results[_w_two].s6); Multiply(result.s7, alpha, results[_w_two].s7); #elif TRA_WPT == 16 Multiply(result.s0, alpha, results[_w_two].s0); Multiply(result.s1, alpha, results[_w_two].s1); Multiply(result.s2, alpha, results[_w_two].s2); Multiply(result.s3, alpha, results[_w_two].s3); Multiply(result.s4, alpha, results[_w_two].s4); Multiply(result.s5, alpha, results[_w_two].s5); Multiply(result.s6, alpha, results[_w_two].s6); Multiply(result.s7, alpha, results[_w_two].s7); Multiply(result.s8, alpha, results[_w_two].s8); Multiply(result.s9, alpha, results[_w_two].s9); Multiply(result.sA, alpha, results[_w_two].sA); Multiply(result.sB, alpha, results[_w_two].sB); Multiply(result.sC, alpha, results[_w_two].sC); Multiply(result.sD, alpha, results[_w_two].sD); Multiply(result.sE, alpha, results[_w_two].sE); Multiply(result.sF, alpha, results[_w_two].sF); #endif const int id_one = gid0*TRA_DIM + get_local_id(0); const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + _w_two; dest[id_two*(ld/TRA_WPT) + id_one] = result; } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/transpose_pad.opencl000066400000000000000000000344131463263031500225050ustar00rootroot00000000000000 // 
=================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file contains the common kernels shared among different BLAS functions. This file contains
// kernels to transpose matrices in various ways, including:
// 1) transposing into a larger matrix by adding padding
// 2) transposing into a smaller matrix by optionally removing padding. This is the general version
//    without restrictions, see the 'transpose_fast.opencl' file for a faster but more restricted
//    transpose kernel.
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// =================================================================================================

// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile,
                                     const int src_one, const int src_two,
                                     const int src_ld, const int src_offset,
                                     __global const real* restrict src,
                                     const int dest_one, const int dest_two,
                                     const int dest_ld, const int dest_offset,
                                     __global real* dest,
                                     const real alpha,
                                     const int do_conjugate) {

  // Loop over the work per thread
  #pragma unroll
  for (int _w_one = 0; _w_one < PADTRA_WPT; _w_one += 1) {
    #pragma unroll
    for (int _w_two = 0; _w_two < PADTRA_WPT; _w_two += 1) {

      // Computes the identifiers for the source matrix. Note that the local and global dimensions
      // do not correspond to each other!
      const int id_src_one = (get_group_id(1)*PADTRA_WPT + _w_two) * PADTRA_TILE + get_local_id(0);
      const int id_src_two = (get_group_id(0)*PADTRA_WPT + _w_one) * PADTRA_TILE + get_local_id(1);

      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
      // Otherwise, set the local memory value to zero.
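      // Zero-filling the out-of-bounds elements is what realises the padding: consumers of the
      // (larger) destination matrix can then assume dimensions that are multiples of the tile size.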
real value; SetToZero(value); if (id_src_two < src_two && id_src_one < src_one) { value = src[id_src_two*src_ld + id_src_one + src_offset]; } const int tile_id0 = get_local_id(0)*PADTRA_WPT + _w_one; const int tile_id1 = get_local_id(1)*PADTRA_WPT + _w_two; tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value; } } // Synchronizes all threads in a workgroup barrier(CLK_LOCAL_MEM_FENCE); // Loop over the work per thread #pragma unroll for (int _w_one = 0; _w_one < PADTRA_WPT; _w_one += 1) { #pragma unroll for (int _w_two = 0; _w_two < PADTRA_WPT; _w_two += 1) { // Computes the identifiers for the destination matrix const int id_dest_one = (get_group_id(0)*PADTRA_WPT + _w_one) * PADTRA_TILE + get_local_id(0); const int id_dest_two = (get_group_id(1)*PADTRA_WPT + _w_two) * PADTRA_TILE + get_local_id(1); // Stores the transposed value in the destination matrix if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { const int tile_id0 = get_local_id(1)*PADTRA_WPT + _w_one; const int tile_id1 = get_local_id(0)*PADTRA_WPT + _w_two; real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0]; if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value); } } } } // Interface to the above function #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposePadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, const real_arg arg_alpha, const int do_conjugate) { const real alpha = GetRealArg(arg_alpha); __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; _TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset, dest, alpha, do_conjugate); } // ================================================================================================= // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile, const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, const real alpha, const int upper, const int lower, const int diagonal_imag_zero) { // Loop over the work per thread #pragma unroll for (int _w_one = 0; _w_one < PADTRA_WPT; _w_one += 1) { #pragma unroll for (int _w_two = 0; _w_two < PADTRA_WPT; _w_two += 1) { // Computes the identifiers for the source matrix. Note that the local and global dimensions // do not correspond to each other! const int id_src_one = (get_group_id(1)*PADTRA_WPT + _w_two) * PADTRA_TILE + get_local_id(0); const int id_src_two = (get_group_id(0)*PADTRA_WPT + _w_one) * PADTRA_TILE + get_local_id(1); // Loads data into the local memory if the thread IDs are within bounds of the source matrix. 
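// In contrast with _TransposePadMatrix above, there is no zero-fill here: out-of-bounds
// elements are simply skipped, since the destination is the smaller (unpadded) matrix.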
if ((id_src_one < src_one) && (id_src_two < src_two)) { real value = src[id_src_two*src_ld + id_src_one + src_offset]; const int tile_id0 = get_local_id(0)*PADTRA_WPT + _w_one; const int tile_id1 = get_local_id(1)*PADTRA_WPT + _w_two; tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value; } } } // Synchronizes all threads in a workgroup barrier(CLK_LOCAL_MEM_FENCE); // Loop over the work per thread #pragma unroll for (int _w_one = 0; _w_one < PADTRA_WPT; _w_one += 1) { #pragma unroll for (int _w_two = 0; _w_two < PADTRA_WPT; _w_two += 1) { // Computes the identifiers for the destination matrix const int id_dest_one = (get_group_id(0)*PADTRA_WPT + _w_one) * PADTRA_TILE + get_local_id(0); const int id_dest_two = (get_group_id(1)*PADTRA_WPT + _w_two) * PADTRA_TILE + get_local_id(1); // Masking in case of triangular matrices: updates only the upper or lower part bool condition = true; #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) if (upper == 1) { condition = (id_dest_one >= id_dest_two); } else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } #endif if (condition) { // Stores the transposed value in the destination matrix if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { const int tile_id0 = get_local_id(1)*PADTRA_WPT + _w_one; const int tile_id1 = get_local_id(0)*PADTRA_WPT + _w_two; real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0]; if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value); } } } } } // Interface to the above function #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposeMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, const real_arg arg_alpha, const int upper, const int lower, const int diagonal_imag_zero) { const real alpha = GetRealArg(arg_alpha); __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; _TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset, dest, alpha, upper, lower, diagonal_imag_zero); } // ================================================================================================= #if defined(ROUTINE_GEMMBATCHED) // Batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposePadMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const __constant int* dest_offsets, __global real* dest, const int do_conjugate) { const int batch = get_group_id(2); const int src_offset = src_offsets[batch]; const int dest_offset = dest_offsets[batch]; real alpha; SetToOne(alpha); __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; _TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset, dest, alpha, do_conjugate); } // Batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) 
#endif void TransposeMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const __constant int* dest_offsets, __global real* dest) { const int batch = get_group_id(2); const int src_offset = src_offsets[batch]; const int dest_offset = dest_offsets[batch]; real alpha; SetToOne(alpha); __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; _TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset, dest, alpha, 0, 0, 0); } #endif // ================================================================================================= #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Strided-batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposePadMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, const int dest_stride, __global real* dest, const int do_conjugate) { const int batch = get_group_id(2); const int src_offset_batch = src_offset + src_stride * batch; const int dest_offset_batch = dest_offset + dest_stride * batch; real alpha; SetToOne(alpha); __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; _TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset_batch, src, dest_one, dest_two, dest_ld, dest_offset_batch, dest, alpha, do_conjugate); } // Strided-batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposeMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, const int dest_stride, __global real* dest) { const int batch = get_group_id(2); const int src_offset_batch = src_offset + src_stride * batch; const int dest_offset_batch = dest_offset + dest_stride * batch; real alpha; SetToOne(alpha); __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; _TransposeMatrix(tile, src_one, src_two, src_ld, src_offset_batch, src, dest_one, dest_two, dest_ld, dest_offset_batch, dest, alpha, 0, 0, 0); } #endif // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/xgemm_batched.opencl000066400000000000000000000111201463263031500224200ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the batched version of the non-direct GEMM kernel. See part 1 for information // about the non-batched version of the kernel. 
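//
// Batching sketch (an illustration under assumptions, not the CLBlast host code; names are
// hypothetical): the batch index comes from the third work-dimension and each batch is assumed
// to start at a flat offset of batch * one * two elements, so a host launch could look like:
//
//   size_t global[3] = {global_m, global_n, batch_count};  // z-dimension selects the batch
//   size_t local[3]  = {MDIMC, NDIMC, 1};                  // matches reqd_work_group_size
//   clEnqueueNDRangeKernel(queue, xgemm_batched_kernel, 3, NULL, global, local, 0, NULL, NULL);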
// // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= #if defined(ROUTINE_GEMMBATCHED) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realM* restrict agm, const int a_one, const int a_two, const __global realN* restrict bgm, const int b_one, const int b_two, __global realM* cgm, const int c_one, const int c_two) { const int batch = get_group_id(2); const real alpha = GetRealArg(arg_alphas[batch]); const real beta = GetRealArg(arg_betas[batch]); // Sets the offsets const int a_offset = batch * a_one * a_two; const int b_offset = batch * b_one * b_two; const int c_offset = batch * c_one * c_two; const __global realM* restrict agm_ = &agm[a_offset / VWM]; const __global realN* restrict bgm_ = &bgm[b_offset / VWN]; __global realM* restrict cgm_ = &cgm[c_offset / VWM]; // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; #endif #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif // Computes the matrix-multiplication and stores the result in global memory #if SA == 1 && SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alpha, beta, alm, blm); #elif SA == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alpha, beta, alm); #elif SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alpha, beta, blm); #else XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alpha, beta); #endif } #endif // ================================================================================================= #if defined(ROUTINE_GEMMSTRIDEDBATCHED) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realM* restrict agm, const int a_one, const int a_two, const __global realN* restrict bgm, const int b_one, const int b_two, __global realM* cgm, const int c_one, const int c_two) { const int batch = get_group_id(2); const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Sets the offsets const int a_offset = batch * a_one * a_two; const int b_offset = batch * b_one * b_two; const int c_offset = batch * c_one * c_two; const __global realM* restrict agm_ = &agm[a_offset / VWM]; const __global realN* restrict bgm_ = &bgm[b_offset / VWN]; __global realM* restrict cgm_ = &cgm[c_offset / VWM]; // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; #endif #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif // Computes the matrix-multiplication and stores the result in global memory #if SA == 1 && SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alpha, beta, alm, blm); #elif SA == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alpha, beta, alm); #elif SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alpha, beta, blm); #else XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alpha, beta); #endif } #endif // 
================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/xgemm_direct_batched.opencl000066400000000000000000000277501463263031500237720ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the batched version of the direct GEMM kernels. See part 1 for information // about the non-batched version of the kernel. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= #if defined(ROUTINE_GEMMBATCHED) // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, __global real* cgm, const __constant int* c_offsets, const int c_ld, const int c_transpose, const int a_conjugate, const int b_conjugate) { const int batch = get_group_id(2); const real_arg arg_alpha = arg_alphas[batch]; const real_arg arg_beta = arg_betas[batch]; const int a_offset = a_offsets[batch]; const int b_offset = b_offsets[batch]; const int c_offset = c_offsets[batch]; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate); } // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, __global real* cgm, const __constant int* c_offsets, const int c_ld, const int c_transpose, const int a_conjugate, const int b_conjugate) { const int batch = get_group_id(2); const real_arg arg_alpha = arg_alphas[batch]; const real_arg arg_beta = arg_betas[batch]; const int a_offset = a_offsets[batch]; const int b_offset = b_offsets[batch]; const int c_offset = c_offsets[batch]; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, 
a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); } // Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, __global real* cgm, const __constant int* c_offsets, const int c_ld, const int c_transpose, const int a_conjugate, const int b_conjugate) { const int batch = get_group_id(2); const real_arg arg_alpha = arg_alphas[batch]; const real_arg arg_beta = arg_betas[batch]; const int a_offset = a_offsets[batch]; const int b_offset = b_offsets[batch]; const int c_offset = c_offsets[batch]; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); } // Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, __global real* cgm, const __constant int* c_offsets, const int c_ld, const int c_transpose, const int a_conjugate, const int b_conjugate) { const int batch = get_group_id(2); const real_arg arg_alpha = arg_alphas[batch]; const real_arg arg_beta = arg_betas[batch]; const int a_offset = a_offsets[batch]; const int b_offset = b_offsets[batch]; const int c_offset = c_offsets[batch]; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate); } #endif // ================================================================================================= #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, const __global realND* restrict bgm, const int b_offset, const int b_ld, const int b_stride, __global real* cgm, const int c_offset, const int c_ld, const int c_stride, const int c_transpose, const int a_conjugate, const int b_conjugate) { const int batch = get_group_id(2); const int a_offset_batch = a_offset + a_stride * batch; const int b_offset_batch = b_offset + b_stride * batch; const int c_offset_batch = c_offset + c_stride * batch; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * 
(WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset_batch, a_ld, bgm, b_offset_batch, b_ld, cgm, c_offset_batch, c_ld, alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate); } // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, const __global realND* restrict bgm, const int b_offset, const int b_ld, const int b_stride, __global real* cgm, const int c_offset, const int c_ld, const int c_stride, const int c_transpose, const int a_conjugate, const int b_conjugate) { const int batch = get_group_id(2); const int a_offset_batch = a_offset + a_stride * batch; const int b_offset_batch = b_offset + b_stride * batch; const int c_offset_batch = c_offset + c_stride * batch; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset_batch, a_ld, bgm, b_offset_batch, b_ld, cgm, c_offset_batch, c_ld, alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); } // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, const __global realND* restrict bgm, const int b_offset, const int b_ld, const int b_stride, __global real* cgm, const int c_offset, const int c_ld, const int c_stride, const int c_transpose, const int a_conjugate, const int b_conjugate) { const int batch = get_group_id(2); const int a_offset_batch = a_offset + a_stride * batch; const int b_offset_batch = b_offset + b_stride * batch; const int c_offset_batch = c_offset + c_stride * batch; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset_batch, a_ld, bgm, b_offset_batch, b_ld, cgm, c_offset_batch, c_ld, alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); } // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, const __global realND* restrict bgm, const int b_offset, const int b_ld, const int b_stride, __global real* cgm, const int c_offset, const int c_ld, const int c_stride, const int c_transpose, const int a_conjugate, const int b_conjugate) { const int batch = get_group_id(2); const int a_offset_batch = a_offset + a_stride * batch; const int b_offset_batch = b_offset + b_stride * batch; const int c_offset_batch = c_offset + c_stride * batch; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, 
a_offset_batch, a_ld, bgm, b_offset_batch, b_ld,
              cgm, c_offset_batch, c_ld, alm, blm,
              1, 1, c_transpose, a_conjugate, b_conjugate);
}

#endif

// =================================================================================================

// End of the C++11 raw string literal
)"

// =================================================================================================
CLBlast-1.6.3/src/kernels/level3/xgemm_direct_part1.opencl000066400000000000000000000225661463263031500234270ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
// pre- and post-processing kernels.
//
// This kernel is separated into three files. This is part 1 out of 3.
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
#ifndef WGD
  #define WGD 8        // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
#endif
#ifndef MDIMCD
  #define MDIMCD 8     // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMCD
  #define NDIMCD 8     // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMAD
  #define MDIMAD 8     // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
#endif
#ifndef NDIMBD
  #define NDIMBD 8     // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
#endif
#ifndef KWID
  #define KWID 1       // Unroll factor of the WGD loop (smaller than or equal to WGD)
#endif
#ifndef VWMD
  #define VWMD 1       // Vector width of matrices A and C
#endif
#ifndef VWND
  #define VWND 1       // Vector width of matrix B
#endif
#ifndef PADA
  #define PADA 1       // Local memory padding for matrix A
#endif
#ifndef PADB
  #define PADB 1       // Local memory padding for matrix B
#endif

// Helper parameters based on the above tuning parameters
#define MWID (WGD/MDIMCD)                  // Work per work-item (M-dimension)
#define NWID (WGD/NDIMCD)                  // Work per work-item (N-dimension)
#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD))  // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD))  // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
#define MWAD (WGD/MDIMAD)                  // Amount of loads-per-thread for matrix A (M-dimension)
#define KWAD (WGD/KDIMAD)                  // Amount of loads-per-thread for matrix A (K-dimension)
#define KWBD (WGD/KDIMBD)                  // Amount of loads-per-thread for matrix B (K-dimension)
#define NWBD (WGD/NDIMBD)                  // Amount of loads-per-thread for matrix B (N-dimension)

// =================================================================================================

// Data-widths in dimension M
#if VWMD == 1
  typedef real realMD;
#elif VWMD == 2
  typedef real2 realMD;
#elif VWMD == 4
  typedef real4 realMD;
#elif VWMD == 8
  typedef real8 realMD;
#elif VWMD == 16
  typedef real16 realMD;
#endif

// Data-widths in dimension N
#if VWND == 1
  typedef real realND;
#elif VWND == 2
  typedef real2 realND;
#elif VWND == 4
  typedef real4 realND;
#elif VWND == 8
  typedef real8 realND;
#elif VWND == 16
  typedef real16 realND;
#endif

// =================================================================================================

// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix.
INLINE_FUNC real GlobalToPrivateDirectA(const __global real* restrict agms, const int _mi,
                                        const int a_ld, const int a_offset,
                                        const int idm, const int idk,
                                        const int a_transpose, const int a_conjugate) {
  const int a_index = (a_transpose) ? (idm + _mi)*a_ld + idk : idk*a_ld + (idm + _mi);
  real result = agms[a_index + a_offset];
  if (a_conjugate) { COMPLEX_CONJUGATE(result); }
  return result;
}

// Same as above, but now for the B input matrix
INLINE_FUNC real GlobalToPrivateDirectB(const __global real* restrict bgms, const int _ni,
                                        const int b_ld, const int b_offset,
                                        const int idn, const int idk,
                                        const int b_transpose, const int b_conjugate) {
  const int b_index = (b_transpose) ? (idn + _ni)*b_ld + idk : idk*b_ld + (idn + _ni);
  real result = bgms[b_index + b_offset];
  if (b_conjugate) { COMPLEX_CONJUGATE(result); }
  return result;
}

// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix. This is the same as above but now includes a bounds check.
INLINE_FUNC real GlobalToPrivateCheckedA(const __global real* restrict agms, const int _mi,
                                         const int a_ld, const int a_offset,
                                         const int idm, const int idk,
                                         const int a_transpose, const int a_conjugate,
                                         const int kSizeM) {
  real result;
  if (idm + _mi < kSizeM) {
    const int a_index = (a_transpose) ? (idm + _mi)*a_ld + idk : idk*a_ld + (idm + _mi);
    result = agms[a_index + a_offset];
    if (a_conjugate) { COMPLEX_CONJUGATE(result); }
  }
  else {
    SetToZero(result);
  }
  return result;
}

// Same as above, but now for the B input matrix
INLINE_FUNC real GlobalToPrivateCheckedB(const __global real* restrict bgms, const int _ni,
                                         const int b_ld, const int b_offset,
                                         const int idn, const int idk,
                                         const int b_transpose, const int b_conjugate,
                                         const int kSizeN) {
  real result;
  if (idn + _ni < kSizeN) {
    const int b_index = (b_transpose) ? (idn + _ni)*b_ld + idk : idk*b_ld + (idn + _ni);
    result = bgms[b_index + b_offset];
    if (b_conjugate) { COMPLEX_CONJUGATE(result); }
  }
  else {
    SetToZero(result);
  }
  return result;
}

// =================================================================================================

// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
INLINE_FUNC real LocalToPrivateDirectA(LOCAL_PTR real* alm, const int _mi, const int kg,
                                       const int a_transpose) {
  const int mg = _mi + get_local_id(0)*MWID;
  const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg;
  return alm[index];
}

// Same as above, but now for the B input matrix
INLINE_FUNC real LocalToPrivateDirectB(LOCAL_PTR real* blm, const int _ni, const int kg,
                                       const int b_transpose) {
  const int ng = _ni + get_local_id(1)*NWID;
  const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng;
  return blm[index];
}

// =================================================================================================

// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
INLINE_FUNC void StoreResultsDirect(__global real* cgm, const real c_value,
                                    const int _mi, const int _ni, const int idm, const int idn,
                                    const real alpha, const real beta,
                                    const int c_ld, const int c_offset, const int c_transpose) {

  // Determines the destination index
  int c_index = (c_transpose) ? (idm + _mi)*c_ld + (idn + _ni) : (idn + _ni)*c_ld + (idm + _mi);

  // The final multiplication with alpha (in case beta == 0)
  real result;
  if (IsZero(beta)) { Multiply(result, alpha, c_value); }
  // The final multiplication with alpha and the addition with beta*C
  else { AXPBY(result, alpha, c_value, beta, cgm[c_index + c_offset]); }
  cgm[c_index + c_offset] = result;
}

// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
INLINE_FUNC void StoreResultsChecked(__global real* cgm, const real c_value,
                                     const int _mi, const int _ni, const int idm, const int idn,
                                     const int kSizeM, const int kSizeN,
                                     const real alpha, const real beta,
                                     const int c_ld, const int c_offset, const int c_transpose) {
  if ((idm + _mi) < kSizeM && (idn + _ni) < kSizeN) {

    // Determines the destination index
    int c_index = (c_transpose) ?
(idm + _mi)*c_ld + (idn + _ni) : (idn + _ni)*c_ld + (idm + _mi); // The final multiplication with alpha (in case beta == 0) real result; if (IsZero(beta)) { Multiply(result, alpha, c_value); } // The final multiplication with alpha and the addition with beta*C else { AXPBY(result, alpha, c_value, beta, cgm[c_index + c_offset]); } cgm[c_index + c_offset] = result; } } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/xgemm_direct_part2.opencl000066400000000000000000000320151463263031500234160ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This is part 2 of 3 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, LOCAL_PTR real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD const int la0 = get_local_id(0); const int la1 = get_local_id(1); #else const int tid = get_local_id(0) + MDIMCD*get_local_id(1); const int la0 = tid % MDIMAD; const int la1 = tid / MDIMAD; #endif #pragma unroll for (int _mia = 0; _mia < MWAD/VWMD; _mia += 1) { #pragma unroll for (int _kia = 0; _kia < KWAD; _kia += 1) { // Computes the indices for the global memory int mg = _mia + la0*(MWAD/VWMD); int kg = _kia + la1*KWAD; int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD); int idk = (a_transpose) ? 
kg + GetGroupID0()*WGD : kg + kwg; // Loads the data from global memory into the local memory const realMD avec = agm[idk*(a_ld/VWMD) + idm + (a_offset/VWMD)]; #if VWMD == 1 alm[kg*(WGD + PADA) + mg] = avec; #elif VWMD == 2 alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x; alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y; #elif VWMD == 4 alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x; alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y; alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z; alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w; #elif VWMD == 8 alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0; alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1; alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2; alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3; alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4; alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5; alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6; alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7; #elif VWMD == 16 alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0; alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1; alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2; alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3; alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4; alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5; alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6; alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7; alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8; alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9; alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA; alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB; alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC; alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD; alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE; alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF; #endif if (a_conjugate) { for (int vm=0; vm // // This is part 3 of 3 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Main body of the kernel. This is the direct version without pre/post processing and restrictions. INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, __global real* cgm, const int c_offset, const int c_ld, LOCAL_PTR real* alm, LOCAL_PTR real* blm, const int a_transpose, const int b_transpose, const int c_transpose, const int a_conjugate, const int b_conjugate) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Extra pointers to scalar versions of global memory const __global real* restrict agms = (const __global real* restrict) agm; const __global real* restrict bgms = (const __global real* restrict) bgm; // Allocates workitem-private memory (registers) #pragma promote_to_registers real apd[MWID]; #pragma promote_to_registers real bpd[NWID]; #pragma promote_to_registers real cpd[NWID * MWID]; // Initializes the accumulation registers #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { SetToZero(cpd[_ni * MWID + _mi]); } } // The faster version of GEMM is not allowed on the (incomplete) borders. 
Therefore, this section // processes only the main parts: output blocks of WGD by WGD. const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD; const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD; if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) { // Loops over all complete workgroup tiles (K-dimension) int kwg = 0; for (; kwg < (kSizeK/WGD) * WGD; kwg += WGD) { // Loads data: off-chip --> local (matrix A and B) if (a_ld % VWMD == 0 && a_offset % VWMD == 0) { GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); } else { GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); } if (b_ld % VWND == 0 && b_offset % VWND == 0) { GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); } else { GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); } barrier(CLK_LOCAL_MEM_FENCE); // Loops over all workitem tiles, unrolled by a factor KWID for (int pwi = 0; pwi < WGD; pwi += KWID) { #pragma unroll for (int _pit = 0; _pit < KWID; _pit += 1) { int kg = pwi + _pit; // Loads data: local --> private (matrix A and B) #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { apd[_mi] = LocalToPrivateDirectA(alm, _mi, kg, a_transpose); } #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { bpd[_ni] = LocalToPrivateDirectB(blm, _ni, kg, b_transpose); } // Performs the accumulation (Cpmd += Apmd * Bpmd) #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); } } } } barrier(CLK_LOCAL_MEM_FENCE); } // Loop over the remaining part (incomplete tile in K-dimension) for (; kwg < kSizeK; ++kwg) { // Loads data: off-chip --> private (matrix A and B) #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { apd[_mi] = GlobalToPrivateDirectA(agms, _mi, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate); } #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { bpd[_ni] = GlobalToPrivateDirectB(bgms, _ni, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate); } // Performs the accumulation (Cpmd += Apmd * Bpmd) #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); } } } // Stores a tile of results and performs the multiplication with alpha and beta #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { StoreResultsDirect(cgm, cpd[_ni * MWID + _mi], _mi, _ni, idm, idn, alpha, beta, c_ld, c_offset, c_transpose); } } } // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions) else { // Loops over all complete workgroup tiles (K-dimension) int kwg = 0; for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) { // Loads data: off-chip --> local (matrix A and B) GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK); GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK); barrier(CLK_LOCAL_MEM_FENCE); // Loops over all workitem tiles, unrolled by a factor KWID for (int pwi = 0; pwi < WGD; pwi += KWID) { #pragma unroll for (int _pit = 0; _pit < KWID; _pit += 1) { int kg = pwi + _pit; // Loads data: local --> private (matrix A and B) #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { apd[_mi] = LocalToPrivateDirectA(alm, _mi, kg, a_transpose); } #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { bpd[_ni] 
= LocalToPrivateDirectB(blm, _ni, kg, b_transpose); } // Performs the accumulation (C += A * B) #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); } } } } barrier(CLK_LOCAL_MEM_FENCE); } // Loop over the remaining part (incomplete tile in K-dimension) for (; kwg < kSizeK; ++kwg) { // Loads data: off-chip --> private (matrix A and B) #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { apd[_mi] = GlobalToPrivateCheckedA(agms, _mi, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM); } #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { bpd[_ni] = GlobalToPrivateCheckedB(bgms, _ni, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN); } // Performs the accumulation (C += A * B) #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); } } } // Stores a tile of results and performs the multiplication with alpha and beta #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { StoreResultsChecked(cgm, cpd[_ni * MWID + _mi], _mi, _ni, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose); } } } } // ================================================================================================= // Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, __global real* cgm, const int c_offset, const int c_ld, const int c_transpose, const int a_conjugate, const int b_conjugate) { __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate); } // Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, __global real* cgm, const int c_offset, const int c_ld, const int c_transpose, const int a_conjugate, const int b_conjugate) { __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); } // Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* 
restrict bgm, const int b_offset, const int b_ld, __global real* cgm, const int c_offset, const int c_ld, const int c_transpose, const int a_conjugate, const int b_conjugate) { __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); } // Direct version of the GEMM kernel with [A, B] = [transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, __global real* cgm, const int c_offset, const int c_ld, const int c_transpose, const int a_conjugate, const int b_conjugate) { __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate); } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/xgemm_part1.opencl000066400000000000000000000340411463263031500220640ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains two optimized matrix-multiplication kernels: // - Kernel 0: inspired by the paper by Matsumoto et al. and the tutorial on // http://www.cedricnugteren.nl/tutorial.php // - Kernel 1: inspired by a Qualcomm optimized GPU kernel with 2D register tiling // https://developer.qualcomm.com/blog/matrix-multiply-adreno-gpus-part-2-host-code-and-kernel // Both are fully configurable (and tunable!) using many parameters. Both kernels support // different data-types (SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM) through a pre-processor define. // // For kernel 0 matrices are accessed as follows: // A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) // B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n) // C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m) // For kernel 1, both A and C are transposed w.r.t. the above // // Or as an image (assuming column-major) // K // o-------o // | | // N | [B^T] | // | | // o-------o // K N // o-------o o-----o // M | [A] | M | [C] | // | | | | // o-------o o-----o // // // This kernel is separated into multiple files. This is part 1 out of 4. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // Parameters set by the tuner or by the database. 
Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. #ifndef GEMMK #define GEMMK 0 // Kernel to choose: 0 regular, 1 with 2D register tiling #endif #ifndef MWG #define MWG 8 // Tile-size in dimension M (e.g. 64, 128) #endif #ifndef NWG #define NWG 8 // Tile-size in dimension N (e.g. 64, 128) #endif #ifndef KWG #define KWG 8 // Tile-size in dimension K (e.g. 8, 16) #endif #ifndef MDIMC #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) #endif #ifndef NDIMC #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) #endif #ifndef MDIMA #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA (kernel 0 only) #endif #ifndef NDIMB #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB (kernel 0 only) #endif #ifndef KWI #define KWI 1 // Unroll factor of the KWG loop (smaller than or equal to KWG) #endif #ifndef VWM #define VWM 1 // Vector width of matrices A and C #endif #ifndef VWN #define VWN 1 // Vector width of matrix B #endif #ifndef STRM #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0) (kernel 0 only) #endif #ifndef STRN #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0) (kernel 0 only) #endif #ifndef SA #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0) (kernel 0 only) #endif #ifndef SB #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0) (kernel 0 only) #endif #ifndef KREG #define KREG 1 // Amount of register tiling in second dimension, multiple of VWN (kernel 1 only) #endif // Helper parameters based on the above tuning parameters #define MWI (MWG/MDIMC) // Work per work-item (M-dimension) #define NWI (NWG/NDIMC) // Work per work-item (N-dimension) #define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA #define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB #define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension) #define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension) #define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension) #define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) // Settings #ifndef USE_VECTOR_MAD #define USE_VECTOR_MAD 0 // Manually unrolls the vector MAD into scalar operations (0) or uses the vector MAD as-is (1) #endif #ifndef GLOBAL_MEM_FENCE #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potentially better performance #endif #ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0 #endif #ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0 #endif #ifndef SUBGROUP_SHUFFLING_INTEL #define SUBGROUP_SHUFFLING_INTEL 0 #endif #ifndef USE_SUBGROUP_SHUFFLING #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel and NVIDIA GPUs #endif // Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.html) #if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL == 1 #pragma OPENCL EXTENSION cl_intel_subgroups: enable #define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs #endif // NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html) #if USE_SUBGROUP_SHUFFLING == 1 #if SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1 #define SUBGROUP_SIZE 32 // Assumes subgroup size
is always 32 on NVIDIA GPUs #endif #endif #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE #undef USE_SUBGROUP_SHUFFLING #define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold #endif // ================================================================================================= // Data-widths in dimension M #if VWM == 1 typedef real realM; #elif VWM == 2 typedef real2 realM; #elif VWM == 4 typedef real4 realM; #elif VWM == 8 typedef real8 realM; #elif VWM == 16 typedef real16 realM; #endif // Data-widths in dimension N #if VWN == 1 typedef real realN; #elif VWN == 2 typedef real2 realN; #elif VWN == 4 typedef real4 realN; #elif VWN == 8 typedef real8 realN; #elif VWN == 16 typedef real16 realN; #endif // ================================================================================================= // Initializes the accumulation registers to zero INLINE_FUNC realM InitAccRegisters() { realM result; #if VWM == 1 SetToZero(result); #elif VWM == 2 SetToZero(result.x); SetToZero(result.y); #elif VWM == 4 SetToZero(result.x); SetToZero(result.y); SetToZero(result.z); SetToZero(result.w); #elif VWM == 8 SetToZero(result.s0); SetToZero(result.s1); SetToZero(result.s2); SetToZero(result.s3); SetToZero(result.s4); SetToZero(result.s5); SetToZero(result.s6); SetToZero(result.s7); #elif VWM == 16 SetToZero(result.s0); SetToZero(result.s1); SetToZero(result.s2); SetToZero(result.s3); SetToZero(result.s4); SetToZero(result.s5); SetToZero(result.s6); SetToZero(result.s7); SetToZero(result.s8); SetToZero(result.s9); SetToZero(result.sA); SetToZero(result.sB); SetToZero(result.sC); SetToZero(result.sD); SetToZero(result.sE); SetToZero(result.sF); #endif return result; } // ================================================================================================= // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. 
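// For illustration (not part of the kernel itself), a worked example of the cooperative load
// below, assuming the basic default parameters from part 1 (MWG = NWG = KWG = 8 and
// MDIMC = NDIMC = MDIMA = 8 with VWM = 1): then KDIMA = (8*8)/8 = 8, MWA = 8/8 = 1 and
// KWA = 8/8 = 1, so each of the 64 threads in the workgroup copies exactly one element of the
// 8 x 8 tile of A from global into local memory per call. Larger MWG or KWG values simply give
// each thread more (possibly vectorised) loads in the two loops below.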
#if SA == 1 INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, LOCAL_PTR realM* alm, const int kSizeM, const int tid, const int kwg) { const int la0 = tid % MDIMA; const int la1 = tid / MDIMA; #pragma unroll for (int _mia = 0; _mia < MWA/VWM; _mia += 1) { #pragma unroll for (int _kia = 0; _kia < KWA; _kia += 1) { // Computes the indices based on strided/non-strided access #if STRM == 0 int mg = _mia + la0*(MWA/VWM); #elif STRM == 1 int mg = la0 + _mia*MDIMA; #endif // Computes the indices for the global memory int kg = _kia + la1*KWA; int idm = mg + GetGroupID0() * (MWG/VWM); int idk = kg + kwg; // Loads the data from global memory (not transposed) into the local memory alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm]; } } } #endif // Same as above, but now for the B input matrix #if SB == 1 INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, LOCAL_PTR realN* blm, const int kSizeN, const int tid, const int kwg) { const int lb0 = tid % NDIMB; const int lb1 = tid / NDIMB; #pragma unroll for (int _kib = 0; _kib < KWB; _kib += 1) { #pragma unroll for (int _nib = 0; _nib < NWB/VWN; _nib += 1) { // Computes the indices based on strided/non-strided access #if STRN == 0 int ng = _nib + lb0*(NWB/VWN); #elif STRN == 1 int ng = lb0 + _nib*NDIMB; #endif // Computes the indices for the global memory int kg = _kib + lb1*KWB; int idn = ng + GetGroupID1() * (NWG/VWN); int idk = kg + kwg; // Loads the data from global memory (transposed) into the local memory blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn]; } } } #endif // ================================================================================================= // Caches global off-chip memory directly into per-thread private memory (registers). This function // is specific for caching the A input matrix. #if SA == 0 && GEMMK == 0 INLINE_FUNC realM GlobalToPrivateA(const __global realM* restrict agm, const int _mi, const int kSizeM, const int idk, const int kwg) { // Computes the indices based on strided/non-strided access #if STRM == 0 int mg = _mi + get_local_id(0)*(MWI/VWM); #elif STRM == 1 int mg = get_local_id(0) + _mi*MDIMC; #endif // Computes the indices for the global memory int idm = mg + GetGroupID0() * (MWG/VWM); // Loads the data from global memory (not transposed) and stores into registers return agm[idk*(kSizeM/VWM) + idm]; } #endif // Same as above, but now for the B input matrix #if SB == 0 && GEMMK == 0 INLINE_FUNC realN GlobalToPrivateB(const __global realN* restrict bgm, const int _ni, const int kSizeN, const int idk) { // Computes the indices based on strided/non-strided access #if STRN == 0 int ng = _ni + get_local_id(1)*(NWI/VWN); #elif STRN == 1 int ng = get_local_id(1) + _ni*NDIMC; #endif // Computes the indices for the global memory int idn = ng + GetGroupID1() * (NWG/VWN); // Loads the data from global memory (transposed) and stores into registers return bgm[idk*(kSizeN/VWN) + idn]; } #endif // ================================================================================================= #if GEMMK == 1 // Caches global off-chip memory directly into per-thread private memory (registers). This function // is specific for caching the A input matrix for kernel 1. 
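// Note on the two loaders below: for the complex precisions (PRECISION == 3232 or 6464) the
// scalar pointer is reinterpreted as a pointer-to-vector, because the vloadN built-ins operate
// on scalar element types only. As a purely hypothetical example of the index arithmetic with
// VWN == 2, NWI == 4 and kSizeK == 64: a thread with tid_y == 1 loading _ni == 2 at idk == 8
// and _ki == 0 computes a_index = (1*4 + 2)*64 + 8 + 0*2 = 392 and vloads 2 values from there.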
INLINE_FUNC realN GlobalToPrivateA2D(const __global real* restrict a_ptr, const int tid_y, const int _ni, const int kSizeK, const int idk, const int _ki) { #if PRECISION == 3232 || PRECISION == 6464 const int a_index = (tid_y * NWI + _ni) * (kSizeK / VWN) + idk / VWN + _ki; const __global realN* restrict agm = (const __global realN* restrict) a_ptr; return agm[a_index]; #else const int a_index = (tid_y * NWI + _ni) * kSizeK + idk + _ki * VWN; #if VWN == 1 return a_ptr[a_index]; #elif VWN == 2 return vload2(0, a_ptr + a_index); #elif VWN == 4 return vload4(0, a_ptr + a_index); #elif VWN == 8 return vload8(0, a_ptr + a_index); #elif VWN == 16 return vload16(0, a_ptr + a_index); #endif #endif } // Same as above, but now for the B input matrix INLINE_FUNC realM GlobalToPrivateB2D(const __global real* restrict b_ptr, const int tid_x, const int _mi, const int kSizeN, const int idk, const int _ki) { #if PRECISION == 3232 || PRECISION == 6464 const int b_index = (idk + _ki) * (kSizeN / VWM) + tid_x * (MWI / VWM) + _mi; const __global realM* restrict bgm = (const __global realM* restrict) b_ptr; return bgm[b_index]; #else const int b_index = (idk + _ki) * kSizeN + tid_x * MWI + _mi * VWM; #if VWM == 1 return b_ptr[b_index]; #elif VWM == 2 return vload2(0, b_ptr + b_index); #elif VWM == 4 return vload4(0, b_ptr + b_index); #elif VWM == 8 return vload8(0, b_ptr + b_index); #elif VWM == 16 return vload16(0, b_ptr + b_index); #endif #endif } #endif // ================================================================================================= // Caches on-chip local memory into per-thread private memory (registers). This function is specific // for caching the A input matrix. #if SA == 1 INLINE_FUNC realM LocalToPrivateA(LOCAL_PTR realM* alm, const int _mi, const int kg) { #if STRM == 0 int mg = _mi + get_local_id(0)*(MWI/VWM); #elif STRM == 1 int mg = get_local_id(0) + _mi*MDIMC; #endif return alm[kg*(MWG/VWM) + mg]; } #endif // Same as above, but now for the B input matrix #if SB == 1 INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR realN* blm, const int _ni, const int kg) { #if STRN == 0 int ng = _ni + get_local_id(1)*(NWI/VWN); #elif STRN == 1 int ng = get_local_id(1) + _ni*NDIMC; #endif return blm[kg*(NWG/VWN) + ng]; } #endif )" // End of the C++11 raw string literal // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/xgemm_part2.opencl000066400000000000000000000153371463263031500220740ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This is part 2 of 4 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. 
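// The first function in this part, MultiplyAddVector, expands a vectorised fused multiply-add
// component by component. For example, with VWM == 4 and USE_VECTOR_MAD == 0 the single vector
// update cvec += avec * bval becomes four scalar MultiplyAdd calls on .x/.y/.z/.w, which can
// help on platforms whose compilers do not map vector arithmetic onto FMA instructions well;
// with USE_VECTOR_MAD == 1 the vector expression is emitted as-is.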
R"( // The vectorised multiply-add function INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { #if USE_VECTOR_MAD == 1 cvec += avec * bval; #else #if VWM == 1 MultiplyAdd(cvec, avec, bval); #elif VWM == 2 MultiplyAdd(cvec.x , avec.x, bval); MultiplyAdd(cvec.y , avec.y, bval); #elif VWM == 4 MultiplyAdd(cvec.x , avec.x, bval); MultiplyAdd(cvec.y , avec.y, bval); MultiplyAdd(cvec.z , avec.z, bval); MultiplyAdd(cvec.w , avec.w, bval); #elif VWM == 8 MultiplyAdd(cvec.s0, avec.s0, bval); MultiplyAdd(cvec.s1, avec.s1, bval); MultiplyAdd(cvec.s2, avec.s2, bval); MultiplyAdd(cvec.s3, avec.s3, bval); MultiplyAdd(cvec.s4, avec.s4, bval); MultiplyAdd(cvec.s5, avec.s5, bval); MultiplyAdd(cvec.s6, avec.s6, bval); MultiplyAdd(cvec.s7, avec.s7, bval); #elif VWM == 16 MultiplyAdd(cvec.s0, avec.s0, bval); MultiplyAdd(cvec.s1, avec.s1, bval); MultiplyAdd(cvec.s2, avec.s2, bval); MultiplyAdd(cvec.s3, avec.s3, bval); MultiplyAdd(cvec.s4, avec.s4, bval); MultiplyAdd(cvec.s5, avec.s5, bval); MultiplyAdd(cvec.s6, avec.s6, bval); MultiplyAdd(cvec.s7, avec.s7, bval); MultiplyAdd(cvec.s8, avec.s8, bval); MultiplyAdd(cvec.s9, avec.s9, bval); MultiplyAdd(cvec.sA, avec.sA, bval); MultiplyAdd(cvec.sB, avec.sB, bval); MultiplyAdd(cvec.sC, avec.sC, bval); MultiplyAdd(cvec.sD, avec.sD, bval); MultiplyAdd(cvec.sE, avec.sE, bval); MultiplyAdd(cvec.sF, avec.sF, bval); #endif #endif return cvec; } // ================================================================================================= // Merges the results in Cpm with the global array in Cgm. This also performs the multiplication // with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm INLINE_FUNC void StoreResults(__global realM* cgm, realM c_value, const int _mi, const int _ni, const int kSizeM, const real alpha, const real beta) { #if STRM == 0 int mg = _mi + get_local_id(0)*(MWI/VWM); #elif STRM == 1 int mg = get_local_id(0) + _mi*MDIMC; #endif #if STRN == 0 int ng = _ni + get_local_id(1)*NWI; #elif STRN == 1 int ng = _ni%VWN + get_local_id(1)*VWN + (_ni/VWN)*VWN*NDIMC; #endif int idm = mg + GetGroupID0() * (MWG/VWM); int idn = ng + GetGroupID1() * NWG; int index = idn*(kSizeM/VWM) + idm; realM result; realM xval = c_value; // The final multiplication with alpha (in case beta == 0) if (IsZero(beta)) { #if VWM == 1 Multiply(result, alpha, xval); #elif VWM == 2 Multiply(result.x, alpha, xval.x); Multiply(result.y, alpha, xval.y); #elif VWM == 4 Multiply(result.x, alpha, xval.x); Multiply(result.y, alpha, xval.y); Multiply(result.z, alpha, xval.z); Multiply(result.w, alpha, xval.w); #elif VWM == 8 Multiply(result.s0, alpha, xval.s0); Multiply(result.s1, alpha, xval.s1); Multiply(result.s2, alpha, xval.s2); Multiply(result.s3, alpha, xval.s3); Multiply(result.s4, alpha, xval.s4); Multiply(result.s5, alpha, xval.s5); Multiply(result.s6, alpha, xval.s6); Multiply(result.s7, alpha, xval.s7); #elif VWM == 16 Multiply(result.s0, alpha, xval.s0); Multiply(result.s1, alpha, xval.s1); Multiply(result.s2, alpha, xval.s2); Multiply(result.s3, alpha, xval.s3); Multiply(result.s4, alpha, xval.s4); Multiply(result.s5, alpha, xval.s5); Multiply(result.s6, alpha, xval.s6); Multiply(result.s7, alpha, xval.s7); Multiply(result.s8, alpha, xval.s8); Multiply(result.s9, alpha, xval.s9); Multiply(result.sA, alpha, xval.sA); Multiply(result.sB, alpha, xval.sB); Multiply(result.sC, alpha, xval.sC); Multiply(result.sD, alpha, xval.sD); Multiply(result.sE, alpha, xval.sE); Multiply(result.sF, alpha, xval.sF); #endif } // The final 
multiplication with alpha and the addition with beta*C else { realM yval = cgm[index]; #if VWM == 1 AXPBY(result, alpha, xval, beta, yval); #elif VWM == 2 AXPBY(result.x, alpha, xval.x, beta, yval.x); AXPBY(result.y, alpha, xval.y, beta, yval.y); #elif VWM == 4 AXPBY(result.x, alpha, xval.x, beta, yval.x); AXPBY(result.y, alpha, xval.y, beta, yval.y); AXPBY(result.z, alpha, xval.z, beta, yval.z); AXPBY(result.w, alpha, xval.w, beta, yval.w); #elif VWM == 8 AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); #elif VWM == 16 AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); AXPBY(result.sD, alpha, xval.sD, beta, yval.sD); AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); #endif } cgm[index] = result; } )" // End of the C++11 raw string literal // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/xgemm_part3.opencl000066400000000000000000000413521463263031500220710ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This is part 3 of 4 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. 
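// A note on the subgroup path defined below: when USE_SUBGROUP_SHUFFLING == 1, each lane of a
// subgroup loads only its own KREG/VWN vectors of A (selected by its lane ID), and the values
// for the other NWI columns are exchanged via clblast_sub_group_shuffle rather than re-loaded,
// reducing the number of A loads by roughly a factor of the subgroup size. This is only valid
// when NWI == SUBGROUP_SIZE and MDIMC >= SUBGROUP_SIZE, as enforced in part 1.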
R"( // A common interface for subgroup functions #if USE_SUBGROUP_SHUFFLING == 1 INLINE_FUNC int clblast_get_sub_group_local_id() { // Intel extension #if SUBGROUP_SHUFFLING_INTEL == 1 return get_sub_group_local_id(); // Nvidia inline PTX #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1 int ret; asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) ); return ret; #endif } INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) { // Intel extension #if SUBGROUP_SHUFFLING_INTEL == 1 return intel_sub_group_shuffle(reg, src); // Nvidia inline PTX // Volta and later requires .sync shuffle instructions with an extra mask arg #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1 realN ret; #if SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1 asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src)); #else asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src)); #endif return ret; #endif } #endif // Main body of the matrix-multiplication algorithm. It calls various (inlined) functions. INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm, const real alpha, const real beta #if SA == 1 && SB == 1 , LOCAL_PTR realM* alm, LOCAL_PTR realN* blm #elif SA == 1 , LOCAL_PTR realM* alm #elif SB == 1 , LOCAL_PTR realN* blm #endif ) { // Allocates workitem-private memory (registers) #if GEMMK == 0 #pragma promote_to_registers realM apm[MWI/VWM]; // MWI * 1 #pragma promote_to_registers realN bpm[NWI/VWN]; // 1 * NWI #elif GEMMK == 1 #if USE_SUBGROUP_SHUFFLING == 1 #pragma promote_to_registers realN apm[KREG/VWN]; // KREG (subgroup shuffling in NWI dimension) #else #pragma promote_to_registers realN apm[NWI*(KREG/VWN)]; // NWI * KREG #endif #pragma promote_to_registers realM bpm[KREG*(MWI/VWM)]; // KREG * MWI #endif #pragma promote_to_registers realM cpm[NWI*(MWI/VWM)]; // NWI * MWI #if GEMMK == 1 const __global real* restrict a_ptr = (const __global real* restrict) &agm[0]; const __global real* restrict b_ptr = (const __global real* restrict) &bgm[0]; const int tid_x = get_local_id(0) + MDIMC * GetGroupID0(); const int tid_y = get_local_id(1) + NDIMC * GetGroupID1(); #endif // Combined thread identifier (volatile to disable caching) #if SA == 1 || SB == 1 volatile int tid = get_local_id(0) + MDIMC*get_local_id(1); #endif // Initializes the accumulation registers #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { #pragma unroll for (int _ni = 0; _ni < NWI; _ni += 1) { cpm[_ni * (MWI/VWM) + _mi] = InitAccRegisters(); } } // Loops over all workgroup tiles for (int kwg = 0; kwg < kSizeK; kwg += KWG * KREG) { // Loads data: off-chip --> local (matrix A) #if SA == 1 GlobalToLocalA(agm, alm, kSizeM, tid, kwg); #endif // Loads data: off-chip --> local (matrix B) #if SB == 1 GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); #endif #if SA == 1 || SB == 1 barrier(CLK_LOCAL_MEM_FENCE); #endif // Loops over all workitem tiles, unrolled by a factor KWI for (int pwi = 0; pwi < KWG * KREG; pwi += KWI * KREG) { #pragma unroll for (int _pit = 0; _pit < KWI*KREG; _pit += KREG) { #if SA == 0 || SB == 0 int idk = kwg + pwi + _pit; #endif #if SA == 1 || SB == 1 int kg = pwi + _pit; #endif // Loads matrix A (kernel 0) or matrix B (kernel 1) #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { // Loads data: local --> private (matrix A) #if GEMMK == 0 && SA == 1 apm[_mi] 
= LocalToPrivateA(alm, _mi, kg); // Loads data: off-chip --> private (matrix A) #elif GEMMK == 0 && SA == 0 apm[_mi] = GlobalToPrivateA(agm, _mi, kSizeM, idk, kwg); // Loads data: 2D global --> 2D private (matrix B) #elif GEMMK == 1 #pragma unroll for (int _ki = 0; _ki < KREG; _ki += 1) { bpm[_ki * (MWI/VWM) + _mi] = GlobalToPrivateB2D(b_ptr, tid_x, _mi, kSizeN, idk, _ki); } #endif } // Loads matrix B (kernel 0) or matrix A (kernel 1) #if GEMMK == 0 #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { // Loads data: local --> private (matrix B) #if SB == 1 bpm[_ni] = LocalToPrivateB(blm, _ni, kg); // Loads data: off-chip --> private (matrix B) #else bpm[_ni] = GlobalToPrivateB(bgm, _ni, kSizeN, idk); #endif } #elif GEMMK == 1 // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups #if USE_SUBGROUP_SHUFFLING == 1 const int _ni = clblast_get_sub_group_local_id(); #pragma unroll for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki); } // Loads data: 2D global --> 2D private (matrix A) #else #pragma unroll for (int _ni = 0; _ni < NWI; _ni += 1) { #pragma unroll for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki); } } #endif #endif // Performs the accumulation (Cpm += Apm * Bpm) #if GEMMK == 0 #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { const realM aval = apm[_mi]; #if VWN == 1 cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni]); #elif VWN == 2 cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni].x); cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi], aval, bpm[_ni].y); #elif VWN == 4 cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni].x); cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi], aval, bpm[_ni].y); cpm[(_ni*VWN + 2)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 2)*(MWI/VWM) + _mi], aval, bpm[_ni].z); cpm[(_ni*VWN + 3)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 3)*(MWI/VWM) + _mi], aval, bpm[_ni].w); #elif VWN == 8 cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni].s0); cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi], aval, bpm[_ni].s1); cpm[(_ni*VWN + 2)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 2)*(MWI/VWM) + _mi], aval, bpm[_ni].s2); cpm[(_ni*VWN + 3)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 3)*(MWI/VWM) + _mi], aval, bpm[_ni].s3); cpm[(_ni*VWN + 4)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 4)*(MWI/VWM) + _mi], aval, bpm[_ni].s4); cpm[(_ni*VWN + 5)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 5)*(MWI/VWM) + _mi], aval, bpm[_ni].s5); cpm[(_ni*VWN + 6)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 6)*(MWI/VWM) + _mi], aval, bpm[_ni].s6); cpm[(_ni*VWN + 7)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 7)*(MWI/VWM) + _mi], aval, bpm[_ni].s7); #elif VWN == 16 cpm[(_ni*VWN + 0 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0 )*(MWI/VWM) + _mi], aval, bpm[_ni].s0); cpm[(_ni*VWN + 1 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 1 )*(MWI/VWM) + _mi], aval, bpm[_ni].s1); cpm[(_ni*VWN + 2 )*(MWI/VWM) + _mi] = 
MultiplyAddVector(cpm[(_ni*VWN + 2 )*(MWI/VWM) + _mi], aval, bpm[_ni].s2); cpm[(_ni*VWN + 3 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 3 )*(MWI/VWM) + _mi], aval, bpm[_ni].s3); cpm[(_ni*VWN + 4 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 4 )*(MWI/VWM) + _mi], aval, bpm[_ni].s4); cpm[(_ni*VWN + 5 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 5 )*(MWI/VWM) + _mi], aval, bpm[_ni].s5); cpm[(_ni*VWN + 6 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 6 )*(MWI/VWM) + _mi], aval, bpm[_ni].s6); cpm[(_ni*VWN + 7 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 7 )*(MWI/VWM) + _mi], aval, bpm[_ni].s7); cpm[(_ni*VWN + 8 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 8 )*(MWI/VWM) + _mi], aval, bpm[_ni].s8); cpm[(_ni*VWN + 9 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 9 )*(MWI/VWM) + _mi], aval, bpm[_ni].s9); cpm[(_ni*VWN + 10)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 10)*(MWI/VWM) + _mi], aval, bpm[_ni].sA); cpm[(_ni*VWN + 11)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 11)*(MWI/VWM) + _mi], aval, bpm[_ni].sB); cpm[(_ni*VWN + 12)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 12)*(MWI/VWM) + _mi], aval, bpm[_ni].sC); cpm[(_ni*VWN + 13)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 13)*(MWI/VWM) + _mi], aval, bpm[_ni].sD); cpm[(_ni*VWN + 14)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 14)*(MWI/VWM) + _mi], aval, bpm[_ni].sE); cpm[(_ni*VWN + 15)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 15)*(MWI/VWM) + _mi], aval, bpm[_ni].sF); #endif } } #elif GEMMK == 1 #pragma unroll for (int _ni = 0; _ni < NWI; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { #pragma unroll for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { #if USE_SUBGROUP_SHUFFLING == 1 const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni); #else const realN aval = apm[_ni * (KREG/VWN) + _ki]; #endif #if VWN == 1 cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval); #elif VWN == 2 cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y); #elif VWN == 4 cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w); #elif VWN == 8 cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5) * 
(MWI/VWM) + _mi], aval.s5); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7); #elif VWN == 16 cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE); cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF); #endif } } } #endif } } #if SA == 1 || SB == 1 barrier(CLK_LOCAL_MEM_FENCE); #endif } #if GLOBAL_MEM_FENCE == 1 barrier(CLK_GLOBAL_MEM_FENCE); #endif // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta #if GEMMK == 0 const int cld = kSizeM; #elif GEMMK == 1 const int cld = kSizeN; #endif #pragma unroll for (int _ni = 0; _ni < NWI; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { StoreResults(cgm, cpm[_ni * (MWI/VWM) + _mi], _mi, _ni, cld, alpha, beta); } } } )" // End of the C++11 raw string literal // ================================================================================================= CLBlast-1.6.3/src/kernels/level3/xgemm_part4.opencl000066400000000000000000000122671463263031500220750ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This is part 4 of 4 of the GEMM kernel. 
See part 1 for more information. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // The upper-triangular and lower-triangular kernels are only used in special cases #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) // Main entry point of the kernel. This is the upper-triangular version. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void XgemmUpper(const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Skips this workgroup if it contains no threads contributing to the upper-triangle if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) { return; } // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; #endif #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif // Computes the matrix-multiplication and stores the result in global memory #if SA == 1 && SB == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); #elif SA == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); #elif SB == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); #else XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); #endif } // Main entry point of the kernel. This is the lower-triangular version. #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void XgemmLower(const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Skips this workgroup if it contains no threads contributing to the lower-triangle if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) { return; } // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; #endif #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif // Computes the matrix-multiplication and stores the result in global memory #if SA == 1 && SB == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); #elif SA == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); #elif SB == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); #else XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); #endif } // ================================================================================================= // If not using a triangular version, include the regular kernel #else // Main entry point of the kernel. This is the regular full version.
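// As a purely illustrative example of the b_offset/c_offset arguments taken below (the actual
// offsets are computed by the host code): if A, B and C were packed back-to-back into a single
// column-major temporary buffer, one possible choice would be b_offset = M*K and
// c_offset = M*K + K*N, whereas b_offset = c_offset = 0 recovers the ordinary case of three
// independent buffers.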
#if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm, const int b_offset, const int c_offset) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Adds the offsets (in case of use of a single temporary buffer for A, B, and C) bgm = &bgm[b_offset]; cgm = &cgm[c_offset]; // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; #endif #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif // Computes the matrix-multiplication and stores the result in global memory #if SA == 1 && SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); #elif SA == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); #elif SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); #else XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); #endif } #endif )" // End of the C++11 raw string literal // ================================================================================================= CLBlast-1.6.3/src/kernels/levelx/000077500000000000000000000000001463263031500165415ustar00rootroot00000000000000CLBlast-1.6.3/src/kernels/levelx/col2im.opencl000066400000000000000000000166741463263031500211460ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // This file contains the col2im kernel, taken from: // https://gist.github.com/vbkaisetsu/a98299df827f9a5245635f646c1d94be // Credits go to https://github.com/vbkaisetsu // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // Work-group size parameters re-used from the 'copy' kernel #ifndef COPY_DIMX #define COPY_DIMX 8 // Local workgroup size in the first dimension (w) #endif #ifndef COPY_DIMY #define COPY_DIMY 8 // Local workgroup size in the second dimension (h) #endif // ================================================================================================= inline int grid_ceil(const int x, const int step) { return x > 0 ? 
((x - 1) / step + 1) * step : x / step * step; } // Main body of the kernel INLINE_FUNC void Xcol2im(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int stride_bez_h, const int stride_bez_w, const int dilation_bez_h, const int dilation_bez_w, const int gcd_h, const int gcd_w, const bool kernel_flip, const __global real* restrict col_buffer, const int col_offset, __global real* im_buffer, const int im_offset) { const int input_h_scaled = (input_h - 1) / gcd_h + 1; // Thread IDs const int gcd_scale_w = get_global_id(0) + (pad_w - 1) / gcd_w + 1; const int gcd_scale_h = ((int) get_global_id(1)) % input_h_scaled + (pad_h - 1) / gcd_h + 1; const int c_id = ((int) get_global_id(1)) / input_h_scaled; const int w_index = gcd_scale_w * gcd_w - pad_w; const int h_index = gcd_scale_h * gcd_h - pad_h; const int th_step = stride_h * dilation_h / gcd_h; const int th_begin = grid_ceil(max(-stride_bez_h * gcd_scale_h * stride_h, (dilation_bez_h * gcd_scale_h - kernel_h + 1) * dilation_h), th_step); const int th_end = min((output_h - stride_bez_h * gcd_scale_h) * stride_h, (dilation_bez_h * gcd_scale_h + 1) * dilation_h); const int tw_step = stride_w * dilation_w / gcd_w; const int tw_begin = grid_ceil(max(-stride_bez_w * gcd_scale_w * stride_w, (dilation_bez_w * gcd_scale_w - kernel_w + 1) * dilation_w), tw_step); const int tw_end = min((output_w - stride_bez_w * gcd_scale_w) * stride_w, (dilation_bez_w * gcd_scale_w + 1) * dilation_w); if (w_index < input_w && c_id < channels) { real val; SetToZero(val); for (int th = th_begin; th < th_end; th += th_step) { for (int tw = tw_begin; tw < tw_end; tw += tw_step) { const int kh_id = -th / dilation_h + dilation_bez_h * gcd_scale_h; const int kw_id = -tw / dilation_w + dilation_bez_w * gcd_scale_w; const int h_id = th / stride_h + stride_bez_h * gcd_scale_h; const int w_id = tw / stride_w + stride_bez_w * gcd_scale_w; const int kernel_index = (kernel_flip) ? 
kernel_h * kernel_w - kw_id - kernel_w * kh_id - 1 : kw_id + kernel_w * kh_id; const int patch_index = w_id + output_w * h_id; const int output_index = patch_index + kernel_index * output_w * output_h + c_id * output_w * output_h * kernel_h * kernel_w; Add(val, val, col_buffer[output_index + col_offset]); } } // Accumulates the resulting value with the existing im-buffer (+= val) const int input_index = w_index + input_w * (h_index + input_h * c_id); real im_buffer_value = im_buffer[input_index + im_offset]; Add(im_buffer[input_index + im_offset], im_buffer_value, val); } } // ================================================================================================= // Kernel flip version of the Xcol2im kernel (for convolution) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int stride_bez_h, const int stride_bez_w, const int dilation_bez_h, const int dilation_bez_w, const int gcd_h, const int gcd_w, const __global real* restrict col_buffer, const int col_offset, __global real* im_buffer, const int im_offset) { const bool kernel_flip = true; Xcol2im(input_h, input_w, channels, output_h, output_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, stride_bez_h, stride_bez_w, dilation_bez_h, dilation_bez_w, gcd_h, gcd_w, kernel_flip, col_buffer, col_offset, im_buffer, im_offset); } // Normal version of the Xcol2im kernel (for cross-correlation) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void Xcol2imKernelNormal(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int stride_bez_h, const int stride_bez_w, const int dilation_bez_h, const int dilation_bez_w, const int gcd_h, const int gcd_w, const __global real* restrict col_buffer, const int col_offset, __global real* im_buffer, const int im_offset) { const bool kernel_flip = false; Xcol2im(input_h, input_w, channels, output_h, output_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, stride_bez_h, stride_bez_w, dilation_bez_h, dilation_bez_w, gcd_h, gcd_w, kernel_flip, col_buffer, col_offset, im_buffer, im_offset); } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/levelx/im2col.opencl000066400000000000000000000127301463263031500211330ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the im2col kernel. 
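// As a small worked example (illustrative only, with the output size assumed to be computed on
// the host via the usual relation output_h = (input_h + 2*pad_h - dilation_h*(kernel_h-1) - 1) /
// stride_h + 1): a 4x4 single-channel input with a 3x3 kernel, stride 1, no padding and no
// dilation gives output_h = output_w = 2, so the kernel below writes a col matrix of
// kernel_h*kernel_w*channels = 9 rows by output_h*output_w = 4 patch elements; taps that fall
// outside the input (when padding is used) are stored as zeros.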
// // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // Work-group size parameters re-used from the 'copy' kernel #ifndef COPY_DIMX #define COPY_DIMX 8 // Local workgroup size in the first dimension (w) #endif #ifndef COPY_DIMY #define COPY_DIMY 8 // Local workgroup size in the second dimension (h) #endif // ================================================================================================= // Main body of the kernel INLINE_FUNC void Xim2col(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const bool kernel_flip, const __global real* restrict im_buffer, const int im_offset, __global real* col_buffer, const int col_offset) { // Thread IDs const int w_id = get_global_id(0); // image width, max 'output_w' const int h_id = ((int)get_global_id(1)) % output_h; // image height, max 'output_h' const int c_id = ((int)get_global_id(1)) / output_h; // input channels if (h_id < output_h && w_id < output_w && c_id < channels) { for (int kh_id = 0; kh_id < kernel_h; ++kh_id) { // kernel height for (int kw_id = 0; kw_id < kernel_w; ++kw_id) { // kernel width // Retrieves the input value const int h_index = -pad_h + kh_id * dilation_h + stride_h * h_id; const int w_index = -pad_w + kw_id * dilation_w + stride_w * w_id; real val; if (h_index >= 0 && h_index < input_h && w_index >= 0 && w_index < input_w) { const int input_index = w_index + input_w * (h_index + input_h * c_id); val = im_buffer[input_index + im_offset]; } else { SetToZero(val); } // Sets the output value const int kernel_index = (kernel_flip) ? 
kernel_h * kernel_w - kw_id - kernel_w * kh_id - 1 : kw_id + kernel_w * kh_id; const int patch_index = w_id + output_w * h_id; const int output_index = patch_index + kernel_index * output_w * output_h + c_id * output_w * output_h * kernel_h * kernel_w; col_buffer[output_index + col_offset] = val; } } } } // ================================================================================================= // Kernel flip version of the Xim2col kernel (for convolution) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void Xim2colKernelFlip(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const __global real* restrict im_buffer, const int im_offset, __global real* col_buffer, const int col_offset) { const bool kernel_flip = true; Xim2col(input_h, input_w, channels, output_h, output_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, kernel_flip, im_buffer, im_offset, col_buffer, col_offset); } // Normal version of the Xim2col kernel (for cross-correlation) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void Xim2colKernelNormal(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const __global real* restrict im_buffer, const int im_offset, __global real* col_buffer, const int col_offset) { const bool kernel_flip = false; Xim2col(input_h, input_w, channels, output_h, output_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, kernel_flip, im_buffer, im_offset, col_buffer, col_offset); } // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/levelx/xconvgemm_part1.opencl000066400000000000000000000133601463263031500230600ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains an implementation of 3D convolution on a 4D image using GEMM kernels. It // uses parameters from the direct GEMM kernel. This is the part with the loads from memory (1/2). // This uses "CONVGEMM_WITH_IM2COL" as a switch to select between direct convgemm and first running // the im2col kernel to create a 'col' temporary matrix. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing.
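// In the direct (non-im2col) mode implemented below, the two load helpers perform the im2col
// indexing on the fly: the flattened K-index is decomposed into (c_id, kh_id, kw_id), the input
// coordinate is derived from the output position via stride, dilation and padding, and any tap
// outside the input is read as zero. With kernel_flip set, kh_id and kw_id are mirrored (e.g.
// kw_id becomes kernel_w - 1 - kw_id), turning cross-correlation into a true convolution,
// consistent with the im2col and col2im kernels.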
R"( // ================================================================================================= #if defined(ROUTINE_CONVGEMM) && !defined(CONVGEMM_WITH_IM2COL) // Loads global off-chip memory into thread-private register files. This function is specific for // loading the image input tensor. This includes a bounds check. INLINE_FUNC real GlobalToPrivateCheckedImage(const __global real* restrict imagegm, const int image_offset_batch, const int h_id, const int w_id, const int kwg, const int input_h, const int input_w, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const bool kernel_flip) { // Im2col indices const int kernel_2d_index = kwg % (kernel_h * kernel_w); const int kw_id = (kernel_flip) ? kernel_w - kernel_2d_index % kernel_w - 1 : kernel_2d_index % kernel_w; const int kh_id = (kernel_flip) ? kernel_h - kernel_2d_index / kernel_w - 1 : kernel_2d_index / kernel_w; const int c_id = kwg / (kernel_h * kernel_w); const int h_index = -pad_h + kh_id * dilation_h + stride_h * h_id; const int w_index = -pad_w + kw_id * dilation_w + stride_w * w_id; // With bounds check real result; if (h_index >= 0 && h_index < input_h && w_index >= 0 && w_index < input_w) { const int image_index = w_index + input_w * (h_index + input_h * c_id); result = imagegm[image_index + image_offset_batch]; } else { SetToZero(result); } return result; } // Loads global off-chip memory into local (shared) memory on-chip. This function is specific for // loading the image input tensor. This includes a bounds check. INLINE_FUNC real GlobalToLocalCheckedImage(const __global real* restrict imagegm, LOCAL_PTR real* alm, const int image_offset_batch, const int output_w, const int kwg, const int input_h, const int input_w, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const bool kernel_flip) { #if MDIMCD == MDIMAD const int la0 = get_local_id(0); const int la1 = get_local_id(1); #else const int tid = get_local_id(0) + MDIMCD*get_local_id(1); const int la0 = tid % MDIMAD; const int la1 = tid / MDIMAD; #endif #pragma unroll for (int _mia = 0; _mia < MWAD; _mia += 1) { #pragma unroll for (int _kia = 0; _kia < KWAD; _kia += 1) { // Computes the indices for the global memory int mg = _mia + la0*MWAD; int kg = _kia + la1*KWAD; int idm = mg + GetGroupID0()*WGD; int idk = kg + kwg; const int w_id = idm % output_w; const int h_id = idm / output_w; // Im2col indices const int kernel_2d_index = idk % (kernel_h * kernel_w); const int kw_id = (kernel_flip) ? kernel_w - kernel_2d_index % kernel_w - 1 : kernel_2d_index % kernel_w; const int kh_id = (kernel_flip) ? 
kernel_h - kernel_2d_index / kernel_w - 1 : kernel_2d_index / kernel_w; const int c_id = idk / (kernel_h * kernel_w); const int h_index = -pad_h + kh_id * dilation_h + stride_h * h_id; const int w_index = -pad_w + kw_id * dilation_w + stride_w * w_id; // Loads the data from global memory into the local memory if (h_index >= 0 && h_index < input_h && w_index >= 0 && w_index < input_w) { const int image_index = w_index + input_w * (h_index + input_h * c_id); const real result = imagegm[image_index + image_offset_batch]; alm[kg*(WGD + PADA) + mg] = result; } else { SetToZero(alm[kg*(WGD + PADA) + mg]); } } } } #endif // defined(ROUTINE_CONVGEMM) && !defined(CONVGEMM_WITH_IM2COL) // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/levelx/xconvgemm_part2.opencl000066400000000000000000000347401463263031500230660ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains an implementation of 3D convolution on a 4D image using GEMM kernels. It // uses parameters from the direct GEMM kernel. This part contains the main kernel (2/2). // This uses "CONVGEMM_WITH_IM2COL" as a switch to select between direct convgemm or first running // the im2col kernel to create a 'col' temporary matrix. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing.
R"( // ================================================================================================= #if defined(ROUTINE_CONVGEMM) // ConvGEMM kernel #if defined(CONVGEMM_WITH_IM2COL) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size, const __global realND* restrict kernelgm, const int kernel_offset, __global real* resultgm, const int result_offset, const int result_stride, const __global realMD* restrict colgm, const int col_offset, const int col_stride) #else INLINE_FUNC void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size, const __global realND* restrict kernelgm, const int kernel_offset, __global real* resultgm, const int result_offset, const int result_stride, const __global realMD* restrict imagegm, const int image_offset, const int input_h, const int input_w, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int output_h, const int output_w, LOCAL_PTR real* alm, LOCAL_PTR real* blm, const bool kernel_flip) #endif { // Batch offsets const int batch = get_group_id(2); #if defined(CONVGEMM_WITH_IM2COL) const int col_offset_batch = col_offset + col_stride * batch; #else const int image_offset_batch = image_offset + channels * input_h * input_w * batch; #endif const int result_offset_batch = result_offset + result_stride * batch; #if defined(CONVGEMM_WITH_IM2COL) __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; #endif // Extra pointers to scalar versions of global memory #if defined(CONVGEMM_WITH_IM2COL) const __global real* restrict colgms = (const __global real* restrict) colgm; #else const __global real* restrict imagegms = (const __global real* restrict) imagegm; #endif const __global real* restrict kernelgms = (const __global real* restrict) kernelgm; // Allocates workitem-private memory (registers) #pragma promote_to_registers real apd[MWID]; #pragma promote_to_registers real bpd[NWID]; #pragma promote_to_registers real cpd[NWID * MWID]; // Initializes the accumulation registers #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { SetToZero(cpd[_ni * MWID + _mi]); } } // Global m/n indices const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD; const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD; #if !defined(CONVGEMM_WITH_IM2COL) const int w_id = idm % output_w; const int h_id = idm / output_w; #endif // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section // processes only the main parts: output blocks of WGD by WGD. 
if ((idm < (num_patches/WGD)*WGD) && (idn < (num_kernels/WGD)*WGD)) { // Loops over all complete workgroup tiles (K-dimension) int kwg = 0; for (; kwg < (patch_size/WGD) * WGD; kwg += WGD) { // Loads data: off-chip --> local (matrix A and B) #if defined(CONVGEMM_WITH_IM2COL) if (num_patches % VWMD == 0 && col_offset_batch % VWMD == 0) { GlobalToLocalDirectA(colgm, alm, num_patches, col_offset_batch, kwg, false, false); } else { GlobalToLocalScalarA(colgms, alm, num_patches, col_offset_batch, kwg, false, false); } #else GlobalToLocalCheckedImage(imagegms, alm, image_offset_batch, output_w, kwg, input_h, input_w, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, kernel_flip); #endif if (patch_size % VWND == 0 && kernel_offset % VWND == 0) { GlobalToLocalDirectB(kernelgm, blm, patch_size, kernel_offset, kwg, true, false); } else { GlobalToLocalScalarB(kernelgms, blm, patch_size, kernel_offset, kwg, true, false); } barrier(CLK_LOCAL_MEM_FENCE); // Loops over all workitem tiles, unrolled by a factor KWID for (int pwi = 0; pwi < WGD; pwi += KWID) { #pragma unroll for (int _pit = 0; _pit < KWID; _pit += 1) { int kg = pwi + _pit; // Loads data: local --> private (matrix A and B) #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { apd[_mi] = LocalToPrivateDirectA(alm, _mi, kg, false); } #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { bpd[_ni] = LocalToPrivateDirectB(blm, _ni, kg, true); } // Performs the accumulation (Cpmd += Apmd * Bpmd) #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); } } } } barrier(CLK_LOCAL_MEM_FENCE); } // Loop over the remaining part (incomplete tile in K-dimension) for (; kwg < patch_size; ++kwg) { // Loads data: off-chip --> private (matrix A and B) #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { #if defined(CONVGEMM_WITH_IM2COL) apd[_mi] = GlobalToPrivateDirectA(colgms, _mi, num_patches, col_offset_batch, idm, kwg, false, false); #else const int w_id = (idm + _mi) % output_w; const int h_id = (idm + _mi) / output_w; apd[_mi] = GlobalToPrivateCheckedImage(imagegms, image_offset_batch, h_id, w_id, kwg, input_h, input_w, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, kernel_flip); #endif } #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { bpd[_ni] = GlobalToPrivateDirectB(kernelgms, _ni, patch_size, kernel_offset, idn, kwg, true, false); } // Performs the accumulation (Cpmd += Apmd * Bpmd) #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); } } } // Stores a tile of results #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { StoreResultsDirect(resultgm, cpd[_ni * MWID + _mi], _mi, _ni, idm, idn, ONE, ZERO, num_patches, result_offset_batch, false); } } } // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions) else { // Loops over all complete workgroup tiles (K-dimension) int kwg = 0; for (; kwg < (patch_size/WGD) * WGD; kwg+=WGD) { // Loads data: off-chip --> local #if defined(CONVGEMM_WITH_IM2COL) GlobalToLocalCheckedA(colgms, alm, num_patches, col_offset_batch, kwg, false, false, num_patches, patch_size); #else GlobalToLocalCheckedImage(imagegms, alm, image_offset_batch, output_w, kwg, input_h, input_w, channels, kernel_h, 
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, kernel_flip); #endif GlobalToLocalCheckedB(kernelgms, blm, patch_size, kernel_offset, kwg, true, false, num_kernels, patch_size); barrier(CLK_LOCAL_MEM_FENCE); // Loops over all workitem tiles, unrolled by a factor KWID for (int pwi = 0; pwi < WGD; pwi += KWID) { #pragma unroll for (int _pit = 0; _pit < KWID; _pit += 1) { int kg = pwi + _pit; // Loads data: local --> private #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { apd[_mi] = LocalToPrivateDirectA(alm, _mi, kg, false); } #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { bpd[_ni] = LocalToPrivateDirectB(blm, _ni, kg, true); } // Performs the accumulation (C += A * B) #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); } } } } barrier(CLK_LOCAL_MEM_FENCE); } // Loop over the remaining part (incomplete tile in K-dimension) for (; kwg < patch_size; ++kwg) { // Loads data: off-chip --> private #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { #if defined(CONVGEMM_WITH_IM2COL) apd[_mi] = GlobalToPrivateCheckedA(colgms, _mi, num_patches, col_offset_batch, idm, kwg, false, false, num_patches); #else const int w_id = (idm + _mi) % output_w; const int h_id = (idm + _mi) / output_w; apd[_mi] = GlobalToPrivateCheckedImage(imagegms, image_offset_batch, h_id, w_id, kwg, input_h, input_w, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, kernel_flip); #endif } #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { bpd[_ni] = GlobalToPrivateCheckedB(kernelgms, _ni, patch_size, kernel_offset, idn, kwg, true, false, num_kernels); } // Performs the accumulation (C += A * B) #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); } } } // Stores a tile of results #pragma unroll for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWID; _mi += 1) { StoreResultsChecked(resultgm, cpd[_ni * MWID + _mi], _mi, _ni, idm, idn, num_patches, num_kernels, ONE, ZERO, num_patches, result_offset_batch, false); } } } } #if !defined(CONVGEMM_WITH_IM2COL) #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch_size, const __global realND* restrict kernelgm, const int kernel_offset, __global real* resultgm, const int result_offset, const int result_stride, const __global realMD* restrict imagegm, const int image_offset, const int input_h, const int input_w, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int output_h, const int output_w) { const bool kernel_flip = true; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; Xconvgemm(num_patches, num_kernels, patch_size, kernelgm, kernel_offset, resultgm, result_offset, result_stride, imagegm, image_offset, input_h, input_w, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, output_h, output_w, alm, blm, kernel_flip); } #if RELAX_WORKGROUP_SIZE == 1 __kernel #else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XconvgemmNormal(const int num_patches, const int num_kernels, 
const int patch_size, const __global realND* restrict kernelgm, const int kernel_offset, __global real* resultgm, const int result_offset, const int result_stride, const __global realMD* restrict imagegm, const int image_offset, const int input_h, const int input_w, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int output_h, const int output_w) { const bool kernel_flip = false; __local real alm[WGD * (WGD + PADA)]; __local real blm[WGD * (WGD + PADB)]; Xconvgemm(num_patches, num_kernels, patch_size, kernelgm, kernel_offset, resultgm, result_offset, result_stride, imagegm, image_offset, input_h, input_w, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, output_h, output_w, alm, blm, kernel_flip); } #endif // !defined(CONVGEMM_WITH_IM2COL) #endif // defined(ROUTINE_CONVGEMM) // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/kernels/opencl_to_cuda.h000066400000000000000000000070311463263031500203720ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains an (incomplete) header to interpret OpenCL kernels as CUDA kernels. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. 
R"( // ================================================================================================= // CLBlast specific additions #define CUDA 1 #define LOCAL_PTR // pointers to local memory don't have to be annotated in CUDA // Replaces the OpenCL get_xxx_ID with CUDA equivalents __device__ int get_local_id(const int x) { if (x == 0) { return threadIdx.x; } if (x == 1) { return threadIdx.y; } return threadIdx.z; } __device__ int get_group_id(const int x) { if (x == 0) { return blockIdx.x; } if (x == 1) { return blockIdx.y; } return blockIdx.z; } __device__ int get_local_size(const int x) { if (x == 0) { return blockDim.x; } if (x == 1) { return blockDim.y; } return blockDim.z; } __device__ int get_num_groups(const int x) { if (x == 0) { return gridDim.x; } if (x == 1) { return gridDim.y; } return gridDim.z; } __device__ int get_global_size(const int x) { if (x == 0) { return gridDim.x * blockDim.x; } if (x == 1) { return gridDim.y * blockDim.y; } return gridDim.z * blockDim.z; } __device__ int get_global_id(const int x) { if (x == 0) { return blockIdx.x*blockDim.x + threadIdx.x; } if (x == 1) { return blockIdx.y*blockDim.y + threadIdx.y; } return blockIdx.z*blockDim.z + threadIdx.z; } // Adds the data-types which are not available natively under CUDA typedef struct { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; } float8; typedef struct { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float s10; float s11; float s12; float s13; float s14; float s15; } float16; typedef struct { double s0; double s1; double s2; double s3; double s4; double s5; double s6; double s7; } double8; typedef struct { double s0; double s1; double s2; double s3; double s4; double s5; double s6; double s7; double s8; double s9; double s10; double s11; double s12; double s13; double s14; double s15; } double16; // Replaces the OpenCL keywords with CUDA equivalent #define __kernel __placeholder__ #define __global #define __placeholder__ extern "C" __global__ #define __local __shared__ #define restrict __restrict__ #define __constant const #define inline __device__ // assumes all device functions are annotated with inline in OpenCL // Kernel attributes (don't replace currently) #define reqd_work_group_size(x, y, z) // Replaces OpenCL synchronisation with CUDA synchronisation #define barrier(x) __syncthreads() // ================================================================================================= // End of the C++11 raw string literal )" // ================================================================================================= CLBlast-1.6.3/src/pyclblast/000077500000000000000000000000001463263031500155745ustar00rootroot00000000000000CLBlast-1.6.3/src/pyclblast/CMakeLists.txt000066400000000000000000000041651463263031500203420ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.20) project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX) # Find python and numpy find_package( Python3 REQUIRED COMPONENTS Interpreter Development.Module NumPy ) # Run the cython compiler cmake_path(APPEND CMAKE_CURRENT_SOURCE_DIR "./src" OUTPUT_VARIABLE Cython_SOURCE_DIR) find_program(CYTHON "cython") add_custom_command( OUTPUT "${Cython_SOURCE_DIR}/pyclblast.cpp" DEPENDS "${Cython_SOURCE_DIR}/pyclblast.pyx" VERBATIM COMMAND "${CYTHON}" -3 "${Cython_SOURCE_DIR}/pyclblast.pyx" --output-file "${Cython_SOURCE_DIR}/pyclblast.cpp") # Add module target Python3_add_library(pyclblast MODULE WITH_SOABI "${Cython_SOURCE_DIR}/pyclblast.cpp") # 
Numpy libraries - NOTE: clean NPY_LIBRARIES cache (may fail for venv) cmake_path(GET Python3_NumPy_INCLUDE_DIRS PARENT_PATH Python3_NumPy_CORE_DIR) unset(NPY_LIBRARIES CACHE) find_library(NPY_LIBRARIES NAMES npymath PATHS ${Python3_NumPy_CORE_DIR} PATH_SUFFIXES lib DOC "Numpy math library" REQUIRED NO_DEFAULT_PATH) target_link_libraries(pyclblast PRIVATE ${NPY_LIBRARIES}) target_include_directories(pyclblast PRIVATE ${Python3_NumPy_INCLUDE_DIRS}) # CLBlast library set(CLBLAST_HINTS ${CLBLAST_ROOT} $ENV{CLBLAST_ROOT} ) find_package(CLBlast CONFIG REQUIRED HINTS ${CLBLAST_HINTS}) target_link_libraries(pyclblast PRIVATE clblast) install(TARGETS pyclblast DESTINATION .) # On Windows pyclblast cannot find the DLL, even when it is on the path. # Probably related to a change in Python 3.8, which loads DLLs only from trusted locations # see https://stackoverflow.com/questions/41365446/how-to-resolve-importerror-dll-load-failed-on-python # One workaround is to copy the DLL to the same dir as the module. # TODO: add python version check if (WIN32) cmake_path(APPEND CLBlast_DIR "../../../bin" OUTPUT_VARIABLE CLBlast_BINDIR) cmake_path(SET CLBlast_BINDIR NORMALIZE "${CLBlast_BINDIR}") unset(CLBlast_SHARED_LIBPATH CACHE) find_file(CLBlast_SHARED_LIBPATH NAMES clblast.dll PATHS ${CLBlast_BINDIR} DOC "CLBlast shared library" REQUIRED) # copy dll to build install(FILES ${CLBlast_SHARED_LIBPATH} DESTINATION .) endif() CLBlast-1.6.3/src/pyclblast/MANIFEST.in000066400000000000000000000001101463263031500173240ustar00rootroot00000000000000include README.md src/*.pyx include samples/*.py include CMakeLists.txt CLBlast-1.6.3/src/pyclblast/README.md000066400000000000000000000057051463263031500170600ustar00rootroot00000000000000 PyCLBlast: Python wrappers for the tuned OpenCL BLAS library CLBlast ================ This Python package provides a straightforward wrapper for CLBlast based on PyOpenCL. CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast repository](https://github.com/CNugteren/CLBlast) and [the CLBlast website](https://cnugteren.github.io/clblast) for more information about CLBlast. Prerequisites ------------- Non-Python requirements: * OpenCL * [CLBlast](https://github.com/CNugteren/CLBlast) Getting started ------------- After installing OpenCL and CLBlast, simply use pip to install PyCLBlast, e.g.: pip install --user pyclblast To start using the library, browse the [CLBlast](https://github.com/CNugteren/CLBlast) documentation or check out the PyCLBlast samples provided in the `samples` subfolder. For developers, install CLBlast and [cython](https://cython.org/) (e.g. in a Python3 virtualenv): pip install Cython And then compile the bindings from this location using pip: pip install . Detecting CLBlast ------------- The CLBlast library must be present on and detectable by your system in order to successfully install the PyCLBlast bindings. On some systems this works automatically, but if the CLBlast library cannot be detected, the PyCLBlast installation will fail. To ensure detection, one can apply one of the following: * Add the CLBlast root directory to the environment path. * Create the environment variable `CLBLAST_ROOT` that holds the path to the CLBlast root directory.
* Define the `cmake` variable `CMAKE_PREFIX_PATH` or the `CLBLAST_ROOT` variable, pointing to the CLBlast root directory, as: pip install . -C skbuild.cmake.args="-DCMAKE_PREFIX_PATH=/root/path/to/clblast" * Create the environment variable `CLBlast_DIR` that holds the path to the directory where either the `CLBlastConfig.cmake` or the `clblast-config.cmake` file resides. Note that the aforementioned environment variables should be set only during the installation of PyCLBlast and can be unset during normal use. Testing PyCLBlast ------------- The most exhaustive tests are the main CLBlast test binaries. Apart from that, you can also run the PyCLBlast smoke tests from the `test` subfolder, e.g. as follows: python -m unittest discover How to release a new version on PyPI ------------- Following [the guide](https://packaging.python.org/tutorials/packaging-projects/), in essence doing (after changing the version number in `setup.py`): python3 -m build python3 -m twine upload --repository pypi dist/pyclblast-1.4.0.tar.gz # use '__token__' as username and supply the token from your PyPI account CLBlast-1.6.3/src/pyclblast/pyproject.toml000066400000000000000000000016771463263031500205170ustar00rootroot00000000000000[build-system] requires = ["scikit-build-core", "cython", "numpy"] build-backend = "scikit_build_core.build" [project] name = "pyclblast" version = "1.4.0" description = "Python bindings for CLBlast, the tuned OpenCL BLAS library" authors = [ {name = "Cedric Nugteren", email = "web@cedricnugteren.nl"} ] license = {text = "Apache Software License"} readme = "README.md" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "Topic :: Software Development :: Libraries", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", ] keywords = ["OpenCL", "BLAS", "CLBlast", "GEMM", "matrix-multiplication"] dependencies = [ "numpy", "pyopencl" ] [project.urls] Homepage = "https://github.com/CNugteren/CLBlast/blob/master/src/pyclblast" [tool.setuptools.packages.find] where = ["src"] CLBlast-1.6.3/src/pyclblast/samples/000077500000000000000000000000001463263031500172405ustar00rootroot00000000000000CLBlast-1.6.3/src/pyclblast/samples/haxpy.py000066400000000000000000000020371463263031500207450ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. # This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line.
# # Author(s): # Cedric Nugteren import numpy as np import pyopencl as cl from pyopencl.array import Array import pyclblast # Settings for this sample dtype = 'float16' alpha = 1.5 alpha_fp16 = pyclblast.float32_to_float16(alpha) n = 4 print("# Setting up OpenCL") ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) print("# Setting up Numpy arrays") x = np.linspace(1.0, n, num=n).astype(dtype=dtype) y = np.linspace(1.0, n / 2, num=n).astype(dtype=dtype) print("# Setting up OpenCL arrays") clx = Array(queue, x.shape, x.dtype) cly = Array(queue, y.shape, y.dtype) clx.set(x) cly.set(y) print("# Example level-1 operation: AXPY") pyclblast.axpy(queue, n, clx, cly, alpha=alpha_fp16) queue.finish() print("# Result for vector y: %s" % cly.get()) print("# Expected result: %s" % (alpha * x + y)) CLBlast-1.6.3/src/pyclblast/samples/override_parameters.py000066400000000000000000000030521463263031500236540ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. # This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line. # # Author(s): # Cedric Nugteren import numpy as np import pyopencl as cl from pyopencl.array import Array import pyclblast from datetime import datetime if __name__ == "__main__": # Set up pyopencl: ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) # Set up a basic sgemm example: m, n, k = 2, 3, 4 a = np.random.rand(m, k).astype(dtype=np.float32) b = np.random.rand(k, n).astype(dtype=np.float32) c = np.empty((m, n), np.float32) cla = Array(queue, a.shape, a.dtype) clb = Array(queue, b.shape, b.dtype) clc = Array(queue, c.shape, c.dtype) cla.set(a) clb.set(b) clc.set(c) # Perform sgemm on these matrices, overriding the CLBlast parameters. In this example, we'll # just change the 'MWG' parameter a couple of times: params = { "KWG": 32, "KWI": 2, "MDIMA": 8, "MDIMC": 8, "MWG": 64, "NDIMB": 8, "NDIMC": 8, "NWG": 64, "SA": 0, "SB": 0, "STRM": 0, "STRN": 0, "VWM": 4, "VWN": 1 } for mwg in (32, 64, 256): print("Running sgemm tuned with MWG = %d" % mwg) params["MWG"] = mwg pyclblast.override_parameters(ctx.devices[0], 'Xgemm', 32, params) pyclblast.gemm(queue, m, n, k, cla, clb, clc, a_ld=k, b_ld=n, c_ld=n) assert np.allclose(clc.get(), a.dot(b)), "uh-oh, xgemm isn't behaving correctly" CLBlast-1.6.3/src/pyclblast/samples/saxpy.py000066400000000000000000000017231463263031500207610ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. # This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line. 
# # Author(s): # Cedric Nugteren import numpy as np import pyopencl as cl from pyopencl.array import Array import pyclblast # Settings for this sample dtype = 'float32' alpha = 1.5 n = 4 print("# Setting up OpenCL") ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) print("# Setting up Numpy arrays") x = np.random.rand(n).astype(dtype=dtype) y = np.random.rand(n).astype(dtype=dtype) print("# Setting up OpenCL arrays") clx = Array(queue, x.shape, x.dtype) cly = Array(queue, y.shape, y.dtype) clx.set(x) cly.set(y) print("# Example level-1 operation: AXPY") pyclblast.axpy(queue, n, clx, cly, alpha=alpha) queue.finish() print("# Result for vector y: %s" % cly.get()) print("# Expected result: %s" % (alpha * x + y)) CLBlast-1.6.3/src/pyclblast/samples/saxpybatched.py000066400000000000000000000025271463263031500222770ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. # This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line. # # Author(s): # Cedric Nugteren import numpy as np import pyopencl as cl from pyopencl.array import Array import pyclblast # Settings for this sample: batch_count = 2 dtype = 'float32' alphas = [1.5, 1.0] n = 4 print("# Setting up OpenCL") ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) print("# Setting up Numpy arrays") x = np.random.rand(n * batch_count).astype(dtype=dtype) y = np.random.rand(n * batch_count).astype(dtype=dtype) print("# Batch offsets: next after each other") x_offsets = [0, n] y_offsets = [0, n] print("# Setting up OpenCL arrays") clx = Array(queue, x.shape, x.dtype) cly = Array(queue, y.shape, y.dtype) clx.set(x) cly.set(y) print("# Example level-1 batched operation: AXPY-batched") assert len(alphas) == len(x_offsets) == len(y_offsets) == batch_count pyclblast.axpyBatched(queue, n, clx, cly, alphas, x_offsets, y_offsets) queue.finish() print("# Full result for vector y: %s" % str(cly.get())) for i in range(batch_count): result = alphas[i] * x[x_offsets[i]:x_offsets[i] + n] + y[y_offsets[i]:y_offsets[i] + n] print("# Expected result batch #%d: %s" % (i, str(result))) CLBlast-1.6.3/src/pyclblast/samples/sgemm.py000066400000000000000000000021031463263031500207160ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. # This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line. 
# # Author(s): # Cedric Nugteren import numpy as np import pyopencl as cl from pyopencl.array import Array import pyclblast # Settings for this sample dtype = 'float32' print("# Setting up OpenCL") ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) print("# Setting up Numpy arrays") m, n, k = 2, 3, 4 a = np.random.rand(m, k).astype(dtype=dtype) b = np.random.rand(k, n).astype(dtype=dtype) c = np.random.rand(m, n).astype(dtype=dtype) print("# Setting up OpenCL arrays") cla = Array(queue, a.shape, a.dtype) clb = Array(queue, b.shape, b.dtype) clc = Array(queue, c.shape, c.dtype) cla.set(a) clb.set(b) clc.set(c) print("# Example level-3 operation: GEMM") pyclblast.gemm(queue, m, n, k, cla, clb, clc, a_ld=k, b_ld=n, c_ld=n) queue.finish() print("# Matrix C result: %s" % clc.get()) print("# Expected result: %s" % (np.dot(a, b))) CLBlast-1.6.3/src/pyclblast/samples/sgemv.py000066400000000000000000000021561463263031500207370ustar00rootroot00000000000000#!/usr/bin/env python # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. # This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line. # # Author(s): # Cedric Nugteren import numpy as np import pyopencl as cl from pyopencl.array import Array import pyclblast # Settings for this sample dtype = 'float32' m, n = 4, 3 alpha = 1.0 beta = 0.0 print("# Setting up OpenCL") ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) print("# Setting up Numpy arrays") a = np.random.rand(m, n).astype(dtype=dtype) x = np.random.rand(n).astype(dtype=dtype) y = np.random.rand(m).astype(dtype=dtype) print("# Setting up OpenCL arrays") cla = Array(queue, a.shape, a.dtype) clx = Array(queue, x.shape, x.dtype) cly = Array(queue, y.shape, y.dtype) cla.set(a) clx.set(x) cly.set(y) print("# Example level-2 operation: GEMV") pyclblast.gemv(queue, m, n, cla, clx, cly, a_ld=n, alpha=alpha, beta=beta) queue.finish() print("# Result for vector y: %s" % cly.get()) print("# Expected result: %s" % (alpha * np.dot(a, x) + beta * y)) CLBlast-1.6.3/src/pyclblast/src/000077500000000000000000000000001463263031500163635ustar00rootroot00000000000000CLBlast-1.6.3/src/pyclblast/src/pyclblast.pyx000066400000000000000000005455561463263031500211460ustar00rootroot00000000000000#distutils: language = c++ #cython: binding=True #################################################################################################### # This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. # # Author(s): # Cedric Nugteren # # This file defines the Python interface to CLBlast. 
It is inspired by: # https://github.com/hunse/pyopencl_blas # #################################################################################################### import binascii import struct import numpy as np import pyopencl as cl from pyopencl.array import Array from libcpp cimport bool from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.string cimport strdup from libc.stdint cimport uint16_t #################################################################################################### # CLBlast and OpenCL data-types #################################################################################################### cdef extern from "clblast_c.h": # Status codes ctypedef enum CLBlastStatusCode: CLBlastSuccess CLBlastOpenCLCompilerNotAvailable CLBlastTempBufferAllocFailure CLBlastOpenCLOutOfResources CLBlastOpenCLOutOfHostMemory CLBlastOpenCLBuildProgramFailure CLBlastInvalidValue CLBlastInvalidCommandQueue CLBlastInvalidMemObject CLBlastInvalidBinary CLBlastInvalidBuildOptions CLBlastInvalidProgram CLBlastInvalidProgramExecutable CLBlastInvalidKernelName CLBlastInvalidKernelDefinition CLBlastInvalidKernel CLBlastInvalidArgIndex CLBlastInvalidArgValue CLBlastInvalidArgSize CLBlastInvalidKernelArgs CLBlastInvalidLocalNumDimensions CLBlastInvalidLocalThreadsTotal CLBlastInvalidLocalThreadsDim CLBlastInvalidGlobalOffset CLBlastInvalidEventWaitList CLBlastInvalidEvent CLBlastInvalidOperation CLBlastInvalidBufferSize CLBlastInvalidGlobalWorkSize CLBlastNotImplemented CLBlastInvalidMatrixA CLBlastInvalidMatrixB CLBlastInvalidMatrixC CLBlastInvalidVectorX CLBlastInvalidVectorY CLBlastInvalidDimension CLBlastInvalidLeadDimA CLBlastInvalidLeadDimB CLBlastInvalidLeadDimC CLBlastInvalidIncrementX CLBlastInvalidIncrementY CLBlastInsufficientMemoryA CLBlastInsufficientMemoryB CLBlastInsufficientMemoryC CLBlastInsufficientMemoryX CLBlastInsufficientMemoryY CLBlastInvalidBatchCount CLBlastInvalidOverrideKernel CLBlastMissingOverrideParameter CLBlastInvalidLocalMemUsage CLBlastNoHalfPrecision CLBlastNoDoublePrecision CLBlastInvalidVectorScalar CLBlastInsufficientMemoryScalar CLBlastDatabaseError CLBlastUnknownError CLBlastUnexpectedError # OpenCL data-types ctypedef float cl_float ctypedef double cl_double ctypedef unsigned int cl_uint ctypedef struct cl_float2: cl_float x cl_float y ctypedef struct cl_double2: cl_double x cl_double y ctypedef unsigned short cl_half # OpenCL special data-types struct _cl_mem: pass struct _cl_command_queue: pass struct _cl_event: pass ctypedef _cl_mem* cl_mem ctypedef _cl_command_queue* cl_command_queue ctypedef _cl_event* cl_event # Matrix layout and transpose types ctypedef enum CLBlastLayout: CLBlastLayoutRowMajor CLBlastLayoutColMajor ctypedef enum CLBlastTranspose: CLBlastTransposeNo CLBlastTransposeYes CLBlastTransposeConjugate ctypedef enum CLBlastTriangle: CLBlastTriangleUpper CLBlastTriangleLower ctypedef enum CLBlastDiagonal: CLBlastDiagonalNonUnit CLBlastDiagonalUnit ctypedef enum CLBlastSide: CLBlastSideLeft CLBlastSideRight # Precision enum ctypedef enum CLBlastPrecision: CLBlastPrecisionSingle CLBlastPrecisionDouble CLBlastPrecisionComplexSingle CLBlastPrecisionComplexDouble # Translates status codes into readable messages cdef get_status_message(CLBlastStatusCode status): if status == CLBlastSuccess: return "CLBlastSuccess" if status == CLBlastOpenCLCompilerNotAvailable: return "CLBlastOpenCLCompilerNotAvailable: CL_COMPILER_NOT_AVAILABLE" if status == CLBlastTempBufferAllocFailure: return "CLBlastTempBufferAllocFailure: 
CL_MEM_OBJECT_ALLOCATION_FAILURE" if status == CLBlastOpenCLOutOfResources: return "CLBlastOpenCLOutOfResources: CL_OUT_OF_RESOURCES" if status == CLBlastOpenCLOutOfHostMemory: return "CLBlastOpenCLOutOfHostMemory: CL_OUT_OF_HOST_MEMORY" if status == CLBlastOpenCLBuildProgramFailure: return "CLBlastOpenCLBuildProgramFailure: CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error" if status == CLBlastInvalidValue: return "CLBlastInvalidValue: CL_INVALID_VALUE" if status == CLBlastInvalidCommandQueue: return "CLBlastInvalidCommandQueue: CL_INVALID_COMMAND_QUEUE" if status == CLBlastInvalidMemObject: return "CLBlastInvalidMemObject: CL_INVALID_MEM_OBJECT" if status == CLBlastInvalidBinary: return "CLBlastInvalidBinary: CL_INVALID_BINARY" if status == CLBlastInvalidBuildOptions: return "CLBlastInvalidBuildOptions: CL_INVALID_BUILD_OPTIONS" if status == CLBlastInvalidProgram: return "CLBlastInvalidProgram: CL_INVALID_PROGRAM" if status == CLBlastInvalidProgramExecutable: return "CLBlastInvalidProgramExecutable: CL_INVALID_PROGRAM_EXECUTABLE" if status == CLBlastInvalidKernelName: return "CLBlastInvalidKernelName: CL_INVALID_KERNEL_NAME" if status == CLBlastInvalidKernelDefinition: return "CLBlastInvalidKernelDefinition: CL_INVALID_KERNEL_DEFINITION" if status == CLBlastInvalidKernel: return "CLBlastInvalidKernel: CL_INVALID_KERNEL" if status == CLBlastInvalidArgIndex: return "CLBlastInvalidArgIndex: CL_INVALID_ARG_INDEX" if status == CLBlastInvalidArgValue: return "CLBlastInvalidArgValue: CL_INVALID_ARG_VALUE" if status == CLBlastInvalidArgSize: return "CLBlastInvalidArgSize: CL_INVALID_ARG_SIZE" if status == CLBlastInvalidKernelArgs: return "CLBlastInvalidKernelArgs: CL_INVALID_KERNEL_ARGS" if status == CLBlastInvalidLocalNumDimensions: return "CLBlastInvalidLocalNumDimensions: CL_INVALID_WORK_DIMENSION: Too many thread dimensions" if status == CLBlastInvalidLocalThreadsTotal: return "CLBlastInvalidLocalThreadsTotal: CL_INVALID_WORK_GROUP_SIZE: Too many threads in total" if status == CLBlastInvalidLocalThreadsDim: return "CLBlastInvalidLocalThreadsDim: CL_INVALID_WORK_ITEM_SIZE: ... 
or for a specific dimension" if status == CLBlastInvalidGlobalOffset: return "CLBlastInvalidGlobalOffset: CL_INVALID_GLOBAL_OFFSET" if status == CLBlastInvalidEventWaitList: return "CLBlastInvalidEventWaitList: CL_INVALID_EVENT_WAIT_LIST" if status == CLBlastInvalidEvent: return "CLBlastInvalidEvent: CL_INVALID_EVENT" if status == CLBlastInvalidOperation: return "CLBlastInvalidOperation: CL_INVALID_OPERATION" if status == CLBlastInvalidBufferSize: return "CLBlastInvalidBufferSize: CL_INVALID_BUFFER_SIZE" if status == CLBlastInvalidGlobalWorkSize: return "CLBlastInvalidGlobalWorkSize: CL_INVALID_GLOBAL_WORK_SIZE" if status == CLBlastNotImplemented: return "CLBlastNotImplemented: Routine or functionality not implemented yet" if status == CLBlastInvalidMatrixA: return "CLBlastInvalidMatrixA: Matrix A is not a valid OpenCL buffer" if status == CLBlastInvalidMatrixB: return "CLBlastInvalidMatrixB: Matrix B is not a valid OpenCL buffer" if status == CLBlastInvalidMatrixC: return "CLBlastInvalidMatrixC: Matrix C is not a valid OpenCL buffer" if status == CLBlastInvalidVectorX: return "CLBlastInvalidVectorX: Vector X is not a valid OpenCL buffer" if status == CLBlastInvalidVectorY: return "CLBlastInvalidVectorY: Vector Y is not a valid OpenCL buffer" if status == CLBlastInvalidDimension: return "CLBlastInvalidDimension: Dimensions M, N, and K have to be larger than zero" if status == CLBlastInvalidLeadDimA: return "CLBlastInvalidLeadDimA: LD of A is smaller than the matrix's first dimension" if status == CLBlastInvalidLeadDimB: return "CLBlastInvalidLeadDimB: LD of B is smaller than the matrix's first dimension" if status == CLBlastInvalidLeadDimC: return "CLBlastInvalidLeadDimC: LD of C is smaller than the matrix's first dimension" if status == CLBlastInvalidIncrementX: return "CLBlastInvalidIncrementX: Increment of vector X cannot be zero" if status == CLBlastInvalidIncrementY: return "CLBlastInvalidIncrementY: Increment of vector Y cannot be zero" if status == CLBlastInsufficientMemoryA: return "CLBlastInsufficientMemoryA: Matrix A's OpenCL buffer is too small" if status == CLBlastInsufficientMemoryB: return "CLBlastInsufficientMemoryB: Matrix B's OpenCL buffer is too small" if status == CLBlastInsufficientMemoryC: return "CLBlastInsufficientMemoryC: Matrix C's OpenCL buffer is too small" if status == CLBlastInsufficientMemoryX: return "CLBlastInsufficientMemoryX: Vector X's OpenCL buffer is too small" if status == CLBlastInsufficientMemoryY: return "CLBlastInsufficientMemoryY: Vector Y's OpenCL buffer is too small" if status == CLBlastInvalidBatchCount: return "CLBlastInvalidBatchCount: The batch count needs to be positive" if status == CLBlastInvalidOverrideKernel: return "CLBlastInvalidOverrideKernel: Trying to override parameters for an invalid kernel" if status == CLBlastMissingOverrideParameter: return "CLBlastMissingOverrideParameter: Missing override parameter(s) for the target kernel" if status == CLBlastInvalidLocalMemUsage: return "CLBlastInvalidLocalMemUsage: Not enough local memory available on this device" if status == CLBlastNoHalfPrecision: return "CLBlastNoHalfPrecision: Half precision (16-bits) not supported by the device" if status == CLBlastNoDoublePrecision: return "CLBlastNoDoublePrecision: Double precision (64-bits) not supported by the device" if status == CLBlastInvalidVectorScalar: return "CLBlastInvalidVectorScalar: The unit-sized vector is not a valid OpenCL buffer" if status == CLBlastInsufficientMemoryScalar: return "CLBlastInsufficientMemoryScalar: The unit-sized 
vector's OpenCL buffer is too small" if status == CLBlastDatabaseError: return "CLBlastDatabaseError: Entry for the device was not found in the database" if status == CLBlastUnknownError: return "CLBlastUnknownError: A catch-all error code representing an unspecified error" if status == CLBlastUnexpectedError: return "CLBlastUnexpectedError: A catch-all error code representing an unexpected exception" return "PyCLBlast: unrecognized CLBlast status code (code %d)" % status #################################################################################################### # Generic helpers #################################################################################################### dtype_size = {np.dtype('float32'): 4, np.dtype('float64'): 8, np.dtype('complex64'): 8, np.dtype('complex128'): 16} def dtypes_str(dtypes): if len(dtypes) == 1: return "'%s'" % dtypes[0] return "one of %s" % dtypes def check_dtype(args, dtypes): dtype = args[0].dtype if not all(arg.dtype == dtype for arg in args): raise ValueError("PyCLBlast: All arguments must have the same dtype (%s)" % dtypes_str(dtypes)) if dtype not in dtypes: raise ValueError("PyCLBlast: Data type must be %s" % dtypes_str(dtypes)) return dtype def check_array(a, ndim, name): if not isinstance(a, Array): raise ValueError("PyCLBlast: '%s' must be a PyOpenCL Array" % name) if not len(a.shape) == ndim: raise ValueError("PyCLBlast: '%s' must have %d dimensions (got %d)" % (name, ndim, len(a.shape))) def check_matrix(a, name): check_array(a, 2, name) def check_vector(a, name): check_array(a, 1, name) #################################################################################################### # Half-precision utility functions #################################################################################################### cdef extern from "numpy/halffloat.h": ctypedef uint16_t npy_half # conversion functions npy_half npy_float_to_half(float f) npy_half npy_double_to_half(double d) cdef npy_half val_to_half(object val): if isinstance(val, (np.float32, np.float16)): return npy_float_to_half(val) else: return npy_double_to_half(val) def float32_to_float16(float32): # Taken from https://gamedev.stackexchange.com/a/28756 F16_EXPONENT_BITS = 0x1F F16_EXPONENT_SHIFT = 10 F16_EXPONENT_BIAS = 15 F16_MANTISSA_BITS = 0x3ff F16_MANTISSA_SHIFT = (23 - F16_EXPONENT_SHIFT) F16_MAX_EXPONENT = (F16_EXPONENT_BITS << F16_EXPONENT_SHIFT) a = struct.pack('>f', float32) b = binascii.hexlify(a) f32 = int(b, 16) sign = (f32 >> 16) & 0x8000 exponent = ((f32 >> 23) & 0xff) - 127 mantissa = f32 & 0x007fffff if exponent == 128: f16 = sign | F16_MAX_EXPONENT if mantissa: f16 |= (mantissa & F16_MANTISSA_BITS) elif exponent > 15: f16 = sign | F16_MAX_EXPONENT elif exponent > -15: exponent += F16_EXPONENT_BIAS mantissa >>= F16_MANTISSA_SHIFT f16 = sign | exponent << F16_EXPONENT_SHIFT | mantissa else: f16 = sign return f16 #################################################################################################### # Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const 
size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) def swap(queue, n, x, y, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0): """ xSWAP: Swap two vectors """ dtype = check_dtype([x, y], ["float32", "float64", "complex64", "complex128", "float16"]) check_vector(x, "x") check_vector(y, "y") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem y_buffer = y.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSswap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDswap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastCswap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZswap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHswap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXswap' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSscal(const size_t n, const float alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDscal(const size_t n, const double alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCscal(const size_t n, const cl_float2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZscal(const size_t n, const cl_double2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHscal(const size_t n, const cl_half alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) def scal(queue, n, x, x_inc = 1, alpha = 1.0, x_offset = 0): """ xSCAL: Vector scaling """ dtype = check_dtype([x], ["float32", "float64", "complex64", "complex128", "float16"]) check_vector(x, "x") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSscal(n, alpha, 
x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDscal(n, alpha, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastCscal(n, cl_float2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZscal(n, cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHscal(n, val_to_half(alpha), x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXscal' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastScopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) def copy(queue, n, x, y, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0): """ xCOPY: Vector copy """ dtype = check_dtype([x, y], ["float32", "float64", "complex64", "complex128", "float16"]) check_vector(x, "x") check_vector(y, "y") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem y_buffer = y.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastScopy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDcopy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastCcopy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZcopy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHcopy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXcopy' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) 
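# ------------------------------------------------------------------------------------------------
# Usage sketch for the copy wrapper above. This is an illustrative addition, not part of the
# upstream bindings: the helper name _example_copy is hypothetical, and the sketch assumes an
# OpenCL device is available via pyopencl's create_some_context(). It reuses the module-level
# imports np, cl and Array from the top of this file.
def _example_copy():
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    n = 4
    x = np.random.rand(n).astype(np.float32)
    clx = Array(queue, x.shape, x.dtype)  # device-side source vector
    cly = Array(queue, x.shape, x.dtype)  # device-side destination vector
    clx.set(x)
    copy(queue, n, clx, cly)  # y <- x, dispatched to CLBlastScopy for float32 inputs
    queue.finish()
    assert np.allclose(cly.get(), x)
# ------------------------------------------------------------------------------------------------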
#################################################################################################### # Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSaxpy(const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDaxpy(const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCaxpy(const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZaxpy(const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHaxpy(const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) def axpy(queue, n, x, y, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset = 0, y_offset = 0): """ xAXPY: Vector-times-constant plus vector """ dtype = check_dtype([x, y], ["float32", "float64", "complex64", "complex128", "float16"]) check_vector(x, "x") check_vector(y, "y") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem y_buffer = y.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSaxpy(n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDaxpy(n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastCaxpy(n, cl_float2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZaxpy(n, cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHaxpy(n, val_to_half(alpha), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXaxpy' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Dot product of two vectors: SDOT/DDOT/HDOT #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDdot(const size_t n, cl_mem 
dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) def dot(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0, dot_offset = 0): """ xDOT: Dot product of two vectors """ dtype = check_dtype([x, y, dot], ["float32", "float64", "float16"]) check_vector(x, "x") check_vector(y, "y") check_matrix(dot, "dot") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem y_buffer = y.base_data.int_ptr cdef cl_mem dot_buffer = dot.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSdot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDdot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHdot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXdot' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Dot product of two complex vectors: CDOTU/ZDOTU #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastCdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) def dotu(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0, dot_offset = 0): """ xDOTU: Dot product of two complex vectors """ dtype = check_dtype([x, y, dot], ["complex64", "complex128"]) check_vector(x, "x") check_vector(y, "y") check_matrix(dot, "dot") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem y_buffer = y.base_data.int_ptr cdef cl_mem dot_buffer = dot.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("complex64"): err = CLBlastCdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXdotu' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) 
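# ------------------------------------------------------------------------------------------------
# Usage sketch for the dotu wrapper above. This is an illustrative addition, not part of the
# upstream bindings: the helper name _example_dotu is hypothetical, and the sketch assumes an
# OpenCL device is available. Note that the result buffer is validated by check_matrix above, so
# it is allocated here as a two-dimensional 1x1 Array.
def _example_dotu():
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    n = 4
    x = (np.random.rand(n) + 1j * np.random.rand(n)).astype(np.complex64)
    y = (np.random.rand(n) + 1j * np.random.rand(n)).astype(np.complex64)
    clx = Array(queue, x.shape, x.dtype)
    cly = Array(queue, y.shape, y.dtype)
    cldot = Array(queue, (1, 1), np.complex64)  # holds the scalar result
    clx.set(x)
    cly.set(y)
    dotu(queue, n, clx, cly, cldot)  # unconjugated dot product, dispatched to CLBlastCdotu
    queue.finish()
    assert np.allclose(cldot.get()[0, 0], np.dot(x, y), atol=1e-5)
# ------------------------------------------------------------------------------------------------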
####################################################################################################
# Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastCdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)

def dotc(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0, dot_offset = 0):
    """
    xDOTC: Dot product of two complex vectors, one conjugated
    """
    dtype = check_dtype([x, y, dot], ["complex64", "complex128"])
    check_vector(x, "x")
    check_vector(y, "y")
    check_matrix(dot, "dot")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr
    cdef cl_mem dot_buffer = <cl_mem><ptrdiff_t>dot.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastCdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXdotc' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastScnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)

def nrm2(queue, n, x, nrm2, x_inc = 1, x_offset = 0, nrm2_offset = 0):
    """
    xNRM2: Euclidean norm of a vector
    """
    dtype = check_dtype([x, nrm2], ["float32", "float64", "complex64", "complex128", "float16"])
    check_vector(x, "x")
    check_matrix(nrm2, "nrm2")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem nrm2_buffer = <cl_mem><ptrdiff_t>nrm2.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastScnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastDznrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXnrm2' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastScasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)

def asum(queue, n, x, asum, x_inc = 1, x_offset = 0, asum_offset = 0):
    """
    xASUM: Absolute sum of values in a vector
    """
    dtype = check_dtype([x, asum], ["float32", "float64", "complex64", "complex128", "float16"])
    check_vector(x, "x")
    check_matrix(asum, "asum")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem asum_buffer = <cl_mem><ptrdiff_t>asum.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSasum(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDasum(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastScasum(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastDzasum(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHasum(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXasum' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
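# Reduction routines such as nrm2 and asum write their scalar result into a
# one-element device buffer instead of returning it to the host. A minimal
# sketch, reusing the hypothetical setup from the axpy example above:
#
#   result = Array(queue, 1, np.float32)
#   pyclblast.nrm2(queue, 4, x, result).wait()
#   print(result.get()[0])  # Euclidean norm of the first 4 elements of x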
RuntimeError("PyCLBlast: 'CLBlastXasum' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastScsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) def sum(queue, n, x, sum, x_inc = 1, x_offset = 0, sum_offset = 0): """ xSUM: Sum of values in a vector (non-BLAS function) """ dtype = check_dtype([x, sum], ["float32", "float64", "complex64", "complex128", "float16"]) check_vector(x, "x") check_matrix(sum, "sum") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem sum_buffer = sum.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastScsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastDzsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsum' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiDamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiCamax(const size_t n, cl_mem imax_buffer, const size_t 
imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiHamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) def amax(queue, n, x, imax, x_inc = 1, x_offset = 0, imax_offset = 0): """ xAMAX: Index of absolute maximum value in a vector """ dtype = check_dtype([x], ["float32", "float64", "complex64", "complex128", "float16"]) check_dtype([imax], ["uint16", "uint32", "uint64"]) check_vector(x, "x") check_matrix(imax, "imax") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem imax_buffer = imax.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastiSamax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastiDamax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastiCamax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastiZamax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastiHamax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXamax' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastiSamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiDamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiCamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiZamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiHamin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) def amin(queue, n, x, imin, x_inc = 1, x_offset = 0, imin_offset = 0): """ xAMIN: Index of absolute minimum value in a vector (non-BLAS function) """ dtype = check_dtype([x], ["float32", "float64", "complex64", "complex128", "float16"]) check_dtype([imin], ["uint16", "uint32", 
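# For the index routines (amax/amin here, max/min below), the output buffer
# holds an index rather than a value, so its dtype must be uint16/uint32/uint64
# while x keeps its floating-point dtype. Sketch with hypothetical names:
#
#   index = Array(queue, 1, np.uint32)
#   pyclblast.amax(queue, 4, x, index).wait()
#   print(index.get()[0])  # zero-based index of the element with largest |x[i]|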
"uint64"]) check_vector(x, "x") check_matrix(imin, "imin") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem imin_buffer = imin.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastiSamin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastiDamin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastiCamin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastiZamin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastiHamin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXamin' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiDmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiCmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiHmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) def max(queue, n, x, imax, x_inc = 1, x_offset = 0, imax_offset = 0): """ xMAX: Index of maximum value in a vector (non-BLAS function) """ dtype = check_dtype([x], ["float32", "float64", "complex64", "complex128", "float16"]) check_dtype([imax], ["uint16", "uint32", "uint64"]) check_vector(x, "x") check_matrix(imax, "imax") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem imax_buffer = imax.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastiSmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastiDmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastiCmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastiZmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == 
np.dtype("float16"): err = CLBlastiHmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXmax' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastiSmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiDmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiCmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastiHmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,cl_command_queue* queue, cl_event* event) def min(queue, n, x, imin, x_inc = 1, x_offset = 0, imin_offset = 0): """ xMIN: Index of minimum value in a vector (non-BLAS function) """ dtype = check_dtype([x], ["float32", "float64", "complex64", "complex128", "float16"]) check_dtype([imin], ["uint16", "uint32", "uint64"]) check_vector(x, "x") check_matrix(imin, "imin") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem imin_buffer = imin.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastiSmin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastiDmin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastiCmin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastiZmin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastiHmin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXmin' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, 
const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc,cl_command_queue* queue, cl_event* event) def gemv(queue, m, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, a_transp = False, a_offset = 0, x_offset = 0, y_offset = 0): """ xGEMV: General matrix-vector multiplication """ dtype = check_dtype([a, x, y], ["float32", "float64", "complex64", "complex128", "float16"]) check_matrix(a, "a") check_vector(x, "x") check_vector(y, "y") cdef cl_mem a_buffer = a.base_data.int_ptr cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem y_buffer = y.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSgemv(CLBlastLayoutRowMajor, a_transpose, m, n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDgemv(CLBlastLayoutRowMajor, a_transpose, m, n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastCgemv(CLBlastLayoutRowMajor, a_transpose, m, n, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, cl_float2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZgemv(CLBlastLayoutRowMajor, a_transpose, m, n, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHgemv(CLBlastLayoutRowMajor, 
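# GEMV sketch (illustrative; assumes the PyOpenCL setup above). These wrappers
# fix the layout to row-major, so a_ld is the row stride of the m-by-n matrix:
#
#   m, n = 2, 3
#   a = Array(queue, (m, n), np.float32); a.set(np.ones((m, n), dtype=np.float32))
#   xv = Array(queue, n, np.float32); xv.set(np.arange(n, dtype=np.float32))
#   yv = Array(queue, m, np.float32)
#   pyclblast.gemv(queue, m, n, a, xv, yv, a_ld=n).wait()  # y := 1.0*A*x + 0.0*y
#   print(yv.get())  # [3. 3.]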
####################################################################################################
# General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)

def gbmv(queue, m, n, kl, ku, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, a_transp = False, a_offset = 0, x_offset = 0, y_offset = 0):
    """
    xGBMV: General banded matrix-vector multiplication
    """
    dtype = check_dtype([a, x, y], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_vector(x, "x")
    check_vector(y, "y")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSgbmv(CLBlastLayoutRowMajor, a_transpose, m, n, kl, ku, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDgbmv(CLBlastLayoutRowMajor, a_transpose, m, n, kl, ku, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCgbmv(CLBlastLayoutRowMajor, a_transpose, m, n, kl, ku, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, cl_float2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZgbmv(CLBlastLayoutRowMajor, a_transpose, m, n, kl, ku, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHgbmv(CLBlastLayoutRowMajor, a_transpose, m, n, kl, ku, val_to_half(alpha), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, val_to_half(beta), y_buffer, y_offset, y_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXgbmv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Hermitian matrix-vector multiplication: CHEMV/ZHEMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)

def hemv(queue, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, lower_triangle = False, a_offset = 0, x_offset = 0, y_offset = 0):
    """
    xHEMV: Hermitian matrix-vector multiplication
    """
    dtype = check_dtype([a, x, y], ["complex64", "complex128"])
    check_matrix(a, "a")
    check_vector(x, "x")
    check_vector(y, "y")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastChemv(CLBlastLayoutRowMajor, triangle, n, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, cl_float2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZhemv(CLBlastLayoutRowMajor, triangle, n, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXhemv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
####################################################################################################
# Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)

def hbmv(queue, n, k, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, lower_triangle = False, a_offset = 0, x_offset = 0, y_offset = 0):
    """
    xHBMV: Hermitian banded matrix-vector multiplication
    """
    dtype = check_dtype([a, x, y], ["complex64", "complex128"])
    check_matrix(a, "a")
    check_vector(x, "x")
    check_vector(y, "y")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastChbmv(CLBlastLayoutRowMajor, triangle, n, k, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, cl_float2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZhbmv(CLBlastLayoutRowMajor, triangle, n, k, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXhbmv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)

def hpmv(queue, n, ap, x, y, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, lower_triangle = False, ap_offset = 0, y_offset = 0, x_offset = 0):
    """
    xHPMV: Hermitian packed matrix-vector multiplication
    """
    dtype = check_dtype([ap, x, y], ["complex64", "complex128"])
    check_matrix(ap, "ap")
    check_vector(x, "x")
    check_vector(y, "y")

    cdef cl_mem ap_buffer = <cl_mem><ptrdiff_t>ap.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastChpmv(CLBlastLayoutRowMajor, triangle, n, cl_float2(x=alpha.real,y=alpha.imag), ap_buffer, ap_offset, x_buffer, x_offset, x_inc, cl_float2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZhpmv(CLBlastLayoutRowMajor, triangle, n, cl_double2(x=alpha.real,y=alpha.imag), ap_buffer, ap_offset, x_buffer, x_offset, x_inc, cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXhpmv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)

def symv(queue, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, lower_triangle = False, a_offset = 0, x_offset = 0, y_offset = 0):
    """
    xSYMV: Symmetric matrix-vector multiplication
    """
    dtype = check_dtype([a, x, y], ["float32", "float64", "float16"])
    check_matrix(a, "a")
    check_vector(x, "x")
    check_vector(y, "y")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSsymv(CLBlastLayoutRowMajor, triangle, n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDsymv(CLBlastLayoutRowMajor, triangle, n, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHsymv(CLBlastLayoutRowMajor, triangle, n, val_to_half(alpha), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, val_to_half(beta), y_buffer, y_offset, y_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXsymv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
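# SYMV is the real/half-precision counterpart of HEMV: a is symmetric and only
# the selected triangle is read. Sketch with hypothetical names:
#
#   asym = Array(queue, (3, 3), np.float32); asym.set(np.eye(3, dtype=np.float32))
#   xv3 = Array(queue, 3, np.float32); xv3.set(np.arange(3, dtype=np.float32))
#   yv3 = Array(queue, 3, np.float32)
#   pyclblast.symv(queue, 3, asym, xv3, yv3, a_ld=3, lower_triangle=True).wait()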
####################################################################################################
# Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)

def sbmv(queue, n, k, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, lower_triangle = False, a_offset = 0, x_offset = 0, y_offset = 0):
    """
    xSBMV: Symmetric banded matrix-vector multiplication
    """
    dtype = check_dtype([a, x, y], ["float32", "float64", "float16"])
    check_matrix(a, "a")
    check_vector(x, "x")
    check_vector(y, "y")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSsbmv(CLBlastLayoutRowMajor, triangle, n, k, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDsbmv(CLBlastLayoutRowMajor, triangle, n, k, alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHsbmv(CLBlastLayoutRowMajor, triangle, n, k, val_to_half(alpha), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, val_to_half(beta), y_buffer, y_offset, y_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXsbmv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem ap_buffer, const size_t ap_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)

def spmv(queue, n, ap, x, y, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, lower_triangle = False, ap_offset = 0, x_offset = 0, y_offset = 0):
    """
    xSPMV: Symmetric packed matrix-vector multiplication
    """
    dtype = check_dtype([ap, x, y], ["float32", "float64", "float16"])
    check_matrix(ap, "ap")
    check_vector(x, "x")
    check_vector(y, "y")

    cdef cl_mem ap_buffer = <cl_mem><ptrdiff_t>ap.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSspmv(CLBlastLayoutRowMajor, triangle, n, alpha, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDspmv(CLBlastLayoutRowMajor, triangle, n, alpha, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, beta, y_buffer, y_offset, y_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHspmv(CLBlastLayoutRowMajor, triangle, n, val_to_half(alpha), ap_buffer, ap_offset, x_buffer, x_offset, x_inc, val_to_half(beta), y_buffer, y_offset, y_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXspmv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
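# The packed variants (hpmv/spmv above, tpmv below) expect the n-by-n triangle
# flattened into a buffer of n*(n+1)/2 elements; note that the ap_ld argument is
# accepted by the wrapper's signature but is not forwarded to CLBlast, since
# packed storage has no leading dimension. Sketch with hypothetical names:
#
#   n = 3
#   ap = Array(queue, n * (n + 1) // 2, np.float32)  # packed triangle of A
#   ap.set(np.array([1, 2, 3, 4, 5, 6], dtype=np.float32))
#   pyclblast.spmv(queue, n, ap, xv3, yv3, ap_ld=n).wait()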
####################################################################################################
# Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)

def trmv(queue, n, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = False, unit_diagonal = False, a_offset = 0, x_offset = 0):
    """
    xTRMV: Triangular matrix-vector multiplication
    """
    dtype = check_dtype([a, x], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_vector(x, "x")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
    diagonal = CLBlastDiagonalUnit if unit_diagonal else CLBlastDiagonalNonUnit

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastStrmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDtrmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCtrmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZtrmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHtrmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXtrmv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)

def tbmv(queue, n, k, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = False, unit_diagonal = False, a_offset = 0, x_offset = 0):
    """
    xTBMV: Triangular banded matrix-vector multiplication
    """
    dtype = check_dtype([a, x], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_vector(x, "x")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
    diagonal = CLBlastDiagonalUnit if unit_diagonal else CLBlastDiagonalNonUnit

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastStbmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDtbmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCtbmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZtbmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHtbmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXtbmv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
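# The triangular multiplies trmv/tbmv update x in place (x := A*x), so x serves
# as both input and output. Sketch with hypothetical names:
#
#   atri = Array(queue, (3, 3), np.float32)
#   atri.set(np.triu(np.ones((3, 3), dtype=np.float32)))
#   pyclblast.trmv(queue, 3, atri, xv3, a_ld=3).wait()  # xv3 := A*xv3 (upper A)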
####################################################################################################
# Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)

def tpmv(queue, n, ap, x, ap_ld, x_inc = 1, lower_triangle = False, a_transp = False, unit_diagonal = False, ap_offset = 0, x_offset = 0):
    """
    xTPMV: Triangular packed matrix-vector multiplication
    """
    dtype = check_dtype([ap, x], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(ap, "ap")
    check_vector(x, "x")

    cdef cl_mem ap_buffer = <cl_mem><ptrdiff_t>ap.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
    diagonal = CLBlastDiagonalUnit if unit_diagonal else CLBlastDiagonalNonUnit

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastStpmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDtpmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCtpmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZtpmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHtpmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXtpmv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event)

def trsv(queue, n, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = False, unit_diagonal = False, a_offset = 0, x_offset = 0):
    """
    xTRSV: Solves a triangular system of equations
    """
    dtype = check_dtype([a, x], ["float32", "float64", "complex64", "complex128"])
    check_matrix(a, "a")
    check_vector(x, "x")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
    diagonal = CLBlastDiagonalUnit if unit_diagonal else CLBlastDiagonalNonUnit

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastStrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXtrsv' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
np.dtype("float32"): err = CLBlastStrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastCtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtrsv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # General rank-1 matrix update: SGER/DGER/HGER #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSger(const CLBlastLayout layout, const size_t m, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDger(const CLBlastLayout layout, const size_t m, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHger(const CLBlastLayout layout, const size_t m, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event) def ger(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset = 0, y_offset = 0, a_offset = 0): """ xGER: General rank-1 matrix update """ dtype = check_dtype([x, y, a], ["float32", "float64", "float16"]) check_vector(x, "x") check_vector(y, "y") check_matrix(a, "a") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem y_buffer = y.base_data.int_ptr cdef cl_mem a_buffer = a.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSger(CLBlastLayoutRowMajor, m, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDger(CLBlastLayoutRowMajor, m, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHger(CLBlastLayoutRowMajor, m, n, val_to_half(alpha), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXger' failed: %s" % 

####################################################################################################
# General rank-1 complex matrix update: CGERU/ZGERU
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastCgeru(const CLBlastLayout layout, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZgeru(const CLBlastLayout layout, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)

def geru(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset = 0, y_offset = 0, a_offset = 0):
    """
    xGERU: General rank-1 complex matrix update
    """
    dtype = check_dtype([x, y, a], ["complex64", "complex128"])
    check_vector(x, "x")
    check_vector(y, "y")
    check_matrix(a, "a")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr
    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastCgeru(CLBlastLayoutRowMajor, m, n, cl_float2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZgeru(CLBlastLayoutRowMajor, m, n, cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXgeru' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# General rank-1 complex conjugated matrix update: CGERC/ZGERC
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastCgerc(const CLBlastLayout layout, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZgerc(const CLBlastLayout layout, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)

def gerc(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset = 0, y_offset = 0, a_offset = 0):
    """
    xGERC: General rank-1 complex conjugated matrix update
    """
    dtype = check_dtype([x, y, a], ["complex64", "complex128"])
    check_vector(x, "x")
    check_vector(y, "y")
    check_matrix(a, "a")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr
    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastCgerc(CLBlastLayoutRowMajor, m, n, cl_float2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZgerc(CLBlastLayoutRowMajor, m, n, cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXgerc' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
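
# Sketch: the two complex rank-1 updates above differ only in conjugation. geru computes
# A := alpha * x * y^T + A, while gerc conjugates y: A := alpha * x * y^H + A. Complex
# alphas are passed as plain Python complex numbers (same assumed names as the ger sketch):
#
#   x = Array(queue, m, dtype=np.complex64)
#   y = Array(queue, n, dtype=np.complex64)
#   a = Array(queue, (m, n), dtype=np.complex64)
#   pyclblast.geru(queue, m, n, x, y, a, a_ld=n, alpha=1.0 + 0.5j)
#   pyclblast.gerc(queue, m, n, x, y, a, a_ld=n, alpha=1.0 - 0.5j)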

####################################################################################################
# Hermitian rank-1 matrix update: CHER/ZHER
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)

def her(queue, n, x, a, a_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_offset = 0, a_offset = 0):
    """
    xHER: Hermitian rank-1 matrix update
    """
    dtype = check_dtype([x, a], ["complex64", "complex128"])
    check_vector(x, "x")
    check_matrix(a, "a")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastCher(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZher(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXher' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
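
# Sketch: Hermitian rank-1 update A := alpha * x * x^H + A on the stored (upper by default)
# triangle; note that alpha is a real scalar for HER, unlike GERU/GERC:
#
#   x = Array(queue, n, dtype=np.complex64)
#   a = Array(queue, (n, n), dtype=np.complex64)
#   pyclblast.her(queue, n, x, a, a_ld=n, alpha=0.5)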

####################################################################################################
# Hermitian packed rank-1 matrix update: CHPR/ZHPR
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event)

def hpr(queue, n, x, ap, ap_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_offset = 0, ap_offset = 0):
    """
    xHPR: Hermitian packed rank-1 matrix update
    """
    dtype = check_dtype([x, ap], ["complex64", "complex128"])
    check_vector(x, "x")
    check_matrix(ap, "ap")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem ap_buffer = <cl_mem><ptrdiff_t>ap.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastChpr(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZhpr(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXhpr' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Hermitian rank-2 matrix update: CHER2/ZHER2
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)

def her2(queue, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_triangle = False, x_offset = 0, y_offset = 0, a_offset = 0):
    """
    xHER2: Hermitian rank-2 matrix update
    """
    dtype = check_dtype([x, y, a], ["complex64", "complex128"])
    check_vector(x, "x")
    check_vector(y, "y")
    check_matrix(a, "a")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr
    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastCher2(CLBlastLayoutRowMajor, triangle, n, cl_float2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZher2(CLBlastLayoutRowMajor, triangle, n, cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXher2' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event)

def hpr2(queue, n, x, y, ap, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_triangle = False, x_offset = 0, y_offset = 0, ap_offset = 0):
    """
    xHPR2: Hermitian packed rank-2 matrix update
    """
    dtype = check_dtype([x, y, ap], ["complex64", "complex128"])
    check_vector(x, "x")
    check_vector(y, "y")
    check_matrix(ap, "ap")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr
    cdef cl_mem ap_buffer = <cl_mem><ptrdiff_t>ap.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastChpr2(CLBlastLayoutRowMajor, triangle, n, cl_float2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZhpr2(CLBlastLayoutRowMajor, triangle, n, cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXhpr2' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
""" xSYR: Symmetric rank-1 matrix update """ dtype = check_dtype([x, a], ["float32", "float64", "float16"]) check_vector(x, "x") check_matrix(a, "a") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem a_buffer = a.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSsyr(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDsyr(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHsyr(CLBlastLayoutRowMajor, triangle, n, val_to_half(alpha), x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsyr' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event) def spr(queue, n, x, ap, ap_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_offset = 0, ap_offset = 0): """ xSPR: Symmetric packed rank-1 matrix update """ dtype = check_dtype([x, ap], ["float32", "float64", "float16"]) check_vector(x, "x") check_matrix(ap, "ap") cdef cl_mem x_buffer = x.base_data.int_ptr cdef cl_mem ap_buffer = ap.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastSspr(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDspr(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHspr(CLBlastLayoutRowMajor, triangle, n, val_to_half(alpha), x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXspr' failed: %s" % get_status_message(err)) return 

####################################################################################################
# Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld,cl_command_queue* queue, cl_event* event)

def syr2(queue, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_triangle = False, x_offset = 0, y_offset = 0, a_offset = 0):
    """
    xSYR2: Symmetric rank-2 matrix update
    """
    dtype = check_dtype([x, y, a], ["float32", "float64", "float16"])
    check_vector(x, "x")
    check_vector(y, "y")
    check_matrix(a, "a")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr
    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSsyr2(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDsyr2(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHsyr2(CLBlastLayoutRowMajor, triangle, n, val_to_half(alpha), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXsyr2' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset,cl_command_queue* queue, cl_event* event)

def spr2(queue, n, x, y, ap, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_triangle = False, x_offset = 0, y_offset = 0, ap_offset = 0):
    """
    xSPR2: Symmetric packed rank-2 matrix update
    """
    dtype = check_dtype([x, y, ap], ["float32", "float64", "float16"])
    check_vector(x, "x")
    check_vector(y, "y")
    check_matrix(ap, "ap")

    cdef cl_mem x_buffer = <cl_mem><ptrdiff_t>x.base_data.int_ptr
    cdef cl_mem y_buffer = <cl_mem><ptrdiff_t>y.base_data.int_ptr
    cdef cl_mem ap_buffer = <cl_mem><ptrdiff_t>ap.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSspr2(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDspr2(CLBlastLayoutRowMajor, triangle, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHspr2(CLBlastLayoutRowMajor, triangle, n, val_to_half(alpha), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXspr2' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)

def gemm(queue, m, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, a_transp = False, b_transp = False, a_offset = 0, b_offset = 0, c_offset = 0):
    """
    xGEMM: General matrix-matrix multiplication
    """
    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_matrix(b, "b")
    check_matrix(c, "c")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem b_buffer = <cl_mem><ptrdiff_t>b.base_data.int_ptr
    cdef cl_mem c_buffer = <cl_mem><ptrdiff_t>c.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
    b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSgemm(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDgemm(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCgemm(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, cl_float2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZgemm(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHgemm(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, val_to_half(alpha), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, val_to_half(beta), c_buffer, c_offset, c_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXgemm' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
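
# A self-contained usage sketch for gemm, modelled after the PyCLBlast samples (sizes and
# values are illustrative only). Computes C := alpha * A * B + beta * C for row-major matrices:
#
#   import numpy as np
#   import pyopencl as cl
#   from pyopencl.array import Array
#   import pyclblast
#
#   ctx = cl.create_some_context()
#   queue = cl.CommandQueue(ctx)
#   m, n, k = 2, 3, 4
#   a = Array(queue, (m, k), dtype=np.float32); a.fill(1.0)
#   b = Array(queue, (k, n), dtype=np.float32); b.fill(2.0)
#   c = Array(queue, (m, n), dtype=np.float32); c.fill(0.0)
#   pyclblast.gemm(queue, m, n, k, a, b, c, a_ld=k, b_ld=n, c_ld=n)  # c := a @ b
#   queue.finish()
#   print(c.get())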

####################################################################################################
# Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)

def symm(queue, m, n, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, right_side = False, lower_triangle = False, a_offset = 0, b_offset = 0, c_offset = 0):
    """
    xSYMM: Symmetric matrix-matrix multiplication
    """
    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_matrix(b, "b")
    check_matrix(c, "c")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem b_buffer = <cl_mem><ptrdiff_t>b.base_data.int_ptr
    cdef cl_mem c_buffer = <cl_mem><ptrdiff_t>c.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    side = CLBlastSideRight if right_side else CLBlastSideLeft
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSsymm(CLBlastLayoutRowMajor, side, triangle, m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDsymm(CLBlastLayoutRowMajor, side, triangle, m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCsymm(CLBlastLayoutRowMajor, side, triangle, m, n, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, cl_float2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZsymm(CLBlastLayoutRowMajor, side, triangle, m, n, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHsymm(CLBlastLayoutRowMajor, side, triangle, m, n, val_to_half(alpha), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, val_to_half(beta), c_buffer, c_offset, c_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXsymm' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)

def hemm(queue, m, n, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, right_side = False, lower_triangle = False, a_offset = 0, b_offset = 0, c_offset = 0):
    """
    xHEMM: Hermitian matrix-matrix multiplication
    """
    dtype = check_dtype([a, b, c], ["complex64", "complex128"])
    check_matrix(a, "a")
    check_matrix(b, "b")
    check_matrix(c, "c")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem b_buffer = <cl_mem><ptrdiff_t>b.base_data.int_ptr
    cdef cl_mem c_buffer = <cl_mem><ptrdiff_t>c.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    side = CLBlastSideRight if right_side else CLBlastSideLeft
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastChemm(CLBlastLayoutRowMajor, side, triangle, m, n, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, cl_float2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZhemm(CLBlastLayoutRowMajor, side, triangle, m, n, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXhemm' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
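
# Sketch: symm multiplies with a symmetric A, hemm with a Hermitian A, in both cases
# reading only the stored triangle. With the default left side, C := alpha*A*B + beta*C
# where A is m-by-m; right_side=True gives C := alpha*B*A + beta*C with A n-by-n:
#
#   a = Array(queue, (m, m), dtype=np.complex64)   # Hermitian, upper triangle used
#   b = Array(queue, (m, n), dtype=np.complex64)
#   c = Array(queue, (m, n), dtype=np.complex64)
#   pyclblast.hemm(queue, m, n, a, b, c, a_ld=m, b_ld=n, c_ld=n, alpha=1+0j, beta=0+0j)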

####################################################################################################
# Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)

def syrk(queue, n, k, a, c, a_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle = False, a_transp = False, a_offset = 0, c_offset = 0):
    """
    xSYRK: Rank-K update of a symmetric matrix
    """
    dtype = check_dtype([a, c], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_matrix(c, "c")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem c_buffer = <cl_mem><ptrdiff_t>c.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSsyrk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDsyrk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCsyrk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, cl_float2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZsyrk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHsyrk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, val_to_half(alpha), a_buffer, a_offset, a_ld, val_to_half(beta), c_buffer, c_offset, c_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXsyrk' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)

####################################################################################################
# Rank-K update of a hermitian matrix: CHERK/ZHERK
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)

def herk(queue, n, k, a, c, a_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle = False, a_transp = False, a_offset = 0, c_offset = 0):
    """
    xHERK: Rank-K update of a hermitian matrix
    """
    dtype = check_dtype([a, c], ["complex64", "complex128"])
    check_matrix(a, "a")
    check_matrix(c, "c")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem c_buffer = <cl_mem><ptrdiff_t>c.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo

    cdef CLBlastStatusCode err
    if dtype == np.dtype("complex64"):
        err = CLBlastCherk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZherk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, alpha, a_buffer, a_offset, a_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXherk' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
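
# Sketch: rank-k updates of the stored triangle of C. syrk computes C := alpha*A*A^T + beta*C;
# herk is the Hermitian analogue C := alpha*A*A^H + beta*C, where alpha and beta are real:
#
#   a = Array(queue, (n, k), dtype=np.float32)
#   c = Array(queue, (n, n), dtype=np.float32)
#   pyclblast.syrk(queue, n, k, a, c, a_ld=k, c_ld=n, alpha=1.0, beta=0.0)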

####################################################################################################
# Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event)

def syr2k(queue, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle = False, ab_transp = False, a_offset = 0, b_offset = 0, c_offset = 0):
    """
    xSYR2K: Rank-2K update of a symmetric matrix
    """
    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_matrix(b, "b")
    check_matrix(c, "c")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem b_buffer = <cl_mem><ptrdiff_t>b.base_data.int_ptr
    cdef cl_mem c_buffer = <cl_mem><ptrdiff_t>c.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper
    ab_transpose = CLBlastTransposeYes if ab_transp else CLBlastTransposeNo

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, cl_float2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, val_to_half(alpha), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, val_to_half(beta), c_buffer, c_offset, c_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXsyr2k' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
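
# Sketch: rank-2k update C := alpha*A*B^T + alpha*B*A^T + beta*C on the stored triangle,
# with A and B both n-by-k when ab_transp is False (her2k below is the Hermitian analogue,
# taking a real beta):
#
#   a = Array(queue, (n, k), dtype=np.float32)
#   b = Array(queue, (n, k), dtype=np.float32)
#   c = Array(queue, (n, n), dtype=np.float32)
#   pyclblast.syr2k(queue, n, k, a, b, c, a_ld=k, b_ld=k, c_ld=n)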
np.dtype("float16"): err = CLBlastHsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, val_to_half(alpha), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, val_to_half(beta), c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsyr2k' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Rank-2K update of a hermitian matrix: CHER2K/ZHER2K #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld,cl_command_queue* queue, cl_event* event) def her2k(queue, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle = False, ab_transp = False, a_offset = 0, b_offset = 0, c_offset = 0): """ xHER2K: Rank-2K update of a hermitian matrix """ dtype = check_dtype([a, b, c], ["complex64", "complex128"]) check_matrix(a, "a") check_matrix(b, "b") check_matrix(c, "c") cdef cl_mem a_buffer = a.base_data.int_ptr cdef cl_mem b_buffer = b.base_data.int_ptr cdef cl_mem c_buffer = c.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper ab_transpose = CLBlastTransposeYes if ab_transp else CLBlastTransposeNo cdef CLBlastStatusCode err if dtype == np.dtype("complex64"): err = CLBlastCher2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZher2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXher2k' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 

####################################################################################################
# Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event)

def trmm(queue, m, n, a, b, a_ld, b_ld, alpha = 1.0, right_side = False, lower_triangle = False, a_transp = False, unit_diagonal = False, a_offset = 0, b_offset = 0):
    """
    xTRMM: Triangular matrix-matrix multiplication
    """
    dtype = check_dtype([a, b], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_matrix(b, "b")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem b_buffer = <cl_mem><ptrdiff_t>b.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL

    side = CLBlastSideRight if right_side else CLBlastSideLeft
    triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
    diagonal = CLBlastDiagonalUnit if unit_diagonal else CLBlastDiagonalNonUnit

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastStrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDtrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCtrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZtrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHtrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, val_to_half(alpha), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXtrmm' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
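
# Sketch: in-place triangular multiply B := alpha * A * B with A an m-by-m triangular
# matrix (or B := alpha * B * A when right_side=True):
#
#   a = Array(queue, (m, m), dtype=np.float32)  # upper triangle used by default
#   b = Array(queue, (m, n), dtype=np.float32)
#   pyclblast.trmm(queue, m, n, a, b, a_ld=m, b_ld=n, alpha=1.0)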
np.dtype("float16"): err = CLBlastHtrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, val_to_half(alpha), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtrmm' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld,cl_command_queue* queue, cl_event* event) def trsm(queue, m, n, a, b, a_ld, b_ld, alpha = 1.0, right_side = False, lower_triangle = False, a_transp = False, unit_diagonal = False, a_offset = 0, b_offset = 0): """ xTRSM: Solves a triangular system of equations """ dtype = check_dtype([a, b], ["float32", "float64", "complex64", "complex128"]) check_matrix(a, "a") check_matrix(b, "b") cdef cl_mem a_buffer = a.base_data.int_ptr cdef cl_mem b_buffer = b.base_data.int_ptr cdef cl_command_queue command_queue = queue.int_ptr cdef cl_event event = NULL side = CLBlastSideRight if right_side else CLBlastSideLeft triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo diagonal = CLBlastDiagonalUnit if unit_diagonal else CLBlastDiagonalNonUnit cdef CLBlastStatusCode err if dtype == np.dtype("float32"): err = CLBlastStrsm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event) elif dtype == np.dtype("float64"): err = CLBlastDtrsm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event) 
elif dtype == np.dtype("complex64"): err = CLBlastCtrsm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZtrsm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtrsm' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) def axpyBatched(queue, n, x, y, alphas, x_offsets, y_offsets, x_inc = 1, y_inc = 1): """ xAXPYBATCHED: Batched version of AXPY """ dtype = check_dtype([x, y], ["float32", "float64", "complex64", "complex128", "float16"]) check_vector(x, "x") check_vector(y, "y") if len(x_offsets) != len(y_offsets) != len(alphas): raise RuntimeError("PyCLBlast: 'CLBlastXaxpyBatched' failed: length of batch-sized arguments x_offsets, y_offsets, alphas should be equal") batch_count = len(x_offsets) cdef size_t *x_offsets_c = PyMem_Malloc(batch_count * sizeof(size_t)) for i in range(batch_count): x_offsets_c[i] = x_offsets[i] cdef size_t *y_offsets_c = PyMem_Malloc(batch_count * sizeof(size_t)) for i in range(batch_count): y_offsets_c[i] = y_offsets[i] cdef void *alphas_c = PyMem_Malloc(batch_count * sizeof(dtype_size[dtype])) for i in range(batch_count): if dtype == np.dtype("float32"): (alphas_c)[i] = alphas[i] elif dtype == np.dtype("float64"): (alphas_c)[i] = alphas[i] elif dtype == np.dtype("complex64"): (alphas_c)[i] = cl_float2(x=alphas[i].real,y=alphas[i].imag) elif dtype == np.dtype("complex128"): (alphas_c)[i] = 
####################################################################################################
# Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
####################################################################################################

cdef extern from "clblast_c.h":
    CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const float *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const double *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_float2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_double2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
    CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_half *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, cl_command_queue* queue, cl_event* event)
def gemmBatched(queue, m, n, k, a, b, c, alphas, betas, a_ld, b_ld, c_ld, a_offsets, b_offsets, c_offsets, a_transp = False, b_transp = False):
    """
    xGEMMBATCHED: Batched version of GEMM
    """
    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_matrix(b, "b")
    check_matrix(c, "c")

    if not (len(a_offsets) == len(b_offsets) == len(c_offsets) == len(alphas) == len(betas)):
        raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: length of batch-sized arguments a_offsets, b_offsets, c_offsets, alphas, betas should be equal")
    batch_count = len(a_offsets)

    cdef size_t *a_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
    for i in range(batch_count):
        a_offsets_c[i] = a_offsets[i]
    cdef size_t *b_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
    for i in range(batch_count):
        b_offsets_c[i] = b_offsets[i]
    cdef size_t *c_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
    for i in range(batch_count):
        c_offsets_c[i] = c_offsets[i]
    cdef void *alphas_c = PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))
    for i in range(batch_count):
        if dtype == np.dtype("float32"):
            (<cl_float*>alphas_c)[i] = alphas[i]
        elif dtype == np.dtype("float64"):
            (<cl_double*>alphas_c)[i] = alphas[i]
        elif dtype == np.dtype("complex64"):
            (<cl_float2*>alphas_c)[i] = cl_float2(x=alphas[i].real,y=alphas[i].imag)
        elif dtype == np.dtype("complex128"):
            (<cl_double2*>alphas_c)[i] = cl_double2(x=alphas[i].real,y=alphas[i].imag)
        elif dtype == np.dtype("float16"):
            (<cl_half*>alphas_c)[i] = val_to_half(alphas[i])
    cdef void *betas_c = PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))
    for i in range(batch_count):
        if dtype == np.dtype("float32"):
            (<cl_float*>betas_c)[i] = betas[i]
        elif dtype == np.dtype("float64"):
            (<cl_double*>betas_c)[i] = betas[i]
        elif dtype == np.dtype("complex64"):
            (<cl_float2*>betas_c)[i] = cl_float2(x=betas[i].real,y=betas[i].imag)
        elif dtype == np.dtype("complex128"):
            (<cl_double2*>betas_c)[i] = cl_double2(x=betas[i].real,y=betas[i].imag)
        elif dtype == np.dtype("float16"):
            (<cl_half*>betas_c)[i] = val_to_half(betas[i])

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem b_buffer = <cl_mem><ptrdiff_t>b.base_data.int_ptr
    cdef cl_mem c_buffer = <cl_mem><ptrdiff_t>c.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
    b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_float*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_double*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float2*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_float2*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double2*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_double2*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_half*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    PyMem_Free(a_offsets_c)
    PyMem_Free(b_offsets_c)
    PyMem_Free(c_offsets_c)
    PyMem_Free(alphas_c)
    PyMem_Free(betas_c)

    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
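A sketch of the batched GEMM wrapper above, multiplying two independent matrix pairs packed into single buffers (assumes a working OpenCL context; row-major layout as used throughout PyCLBlast, so the leading dimensions equal the row widths):

    import numpy as np
    import pyopencl as cl
    from pyopencl.array import Array
    import pyclblast

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    m = n = k = 2
    batch = 2
    a = np.random.rand(batch, m, k).astype(np.float32)
    b = np.random.rand(batch, k, n).astype(np.float32)
    c = np.zeros((batch, m, n), dtype=np.float32)
    cl_a = Array(queue, a.shape, a.dtype); cl_a.set(a)
    cl_b = Array(queue, b.shape, b.dtype); cl_b.set(b)
    cl_c = Array(queue, c.shape, c.dtype); cl_c.set(c)

    pyclblast.gemmBatched(queue, m, n, k, cl_a, cl_b, cl_c,
                          alphas=[1.0, 1.0], betas=[0.0, 0.0],
                          a_ld=k, b_ld=n, c_ld=n,
                          a_offsets=[0, m * k], b_offsets=[0, k * n], c_offsets=[0, m * n])
    queue.finish()
    for i in range(batch):
        assert np.allclose(cl_c.get()[i], a[i] @ b[i], atol=1e-4)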
batch_count, &command_queue, &event) elif dtype == np.dtype("complex64"): err = CLBlastCgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event) elif dtype == np.dtype("complex128"): err = CLBlastZgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event) elif dtype == np.dtype("float16"): err = CLBlastHgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) PyMem_Free(a_offsets_c) PyMem_Free(b_offsets_c) PyMem_Free(c_offsets_c) PyMem_Free(alphas_c) PyMem_Free(betas_c) if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(event) #################################################################################################### # StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED #################################################################################################### cdef extern from "clblast_c.h": CLBlastStatusCode CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t 
def gemmStridedBatched(queue, m, n, k, batch_count, a, b, c, a_ld, b_ld, c_ld, a_stride, b_stride, c_stride, alpha = 1.0, beta = 0.0, a_transp = False, b_transp = False, a_offset = 0, b_offset = 0, c_offset = 0):
    """
    xGEMMSTRIDEDBATCHED: StridedBatched version of GEMM
    """
    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
    check_matrix(a, "a")
    check_matrix(b, "b")
    check_matrix(c, "c")

    cdef cl_mem a_buffer = <cl_mem><ptrdiff_t>a.base_data.int_ptr
    cdef cl_mem b_buffer = <cl_mem><ptrdiff_t>b.base_data.int_ptr
    cdef cl_mem c_buffer = <cl_mem><ptrdiff_t>c.base_data.int_ptr

    cdef cl_command_queue command_queue = <cl_command_queue><ptrdiff_t>queue.int_ptr
    cdef cl_event event = NULL
    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
    b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo

    cdef CLBlastStatusCode err
    if dtype == np.dtype("float32"):
        err = CLBlastSgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, beta, c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
    elif dtype == np.dtype("float64"):
        err = CLBlastDgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, beta, c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
    elif dtype == np.dtype("complex64"):
        err = CLBlastCgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, cl_float2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
    elif dtype == np.dtype("complex128"):
        err = CLBlastZgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
    elif dtype == np.dtype("float16"):
        err = CLBlastHgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, val_to_half(alpha), a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, val_to_half(beta), c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
    else:
        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'CLBlastXgemmStridedBatched' failed: %s" % get_status_message(err))
    return cl.Event.from_int_ptr(<ptrdiff_t>event)
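The strided variant avoids per-batch offset arrays when batches sit at a fixed distance in memory; a sketch under the same assumptions as the earlier examples (working OpenCL context, illustrative sizes):

    import numpy as np
    import pyopencl as cl
    from pyopencl.array import Array
    import pyclblast

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    m = n = k = 2
    batch = 2
    a = np.random.rand(batch, m, k).astype(np.float32)
    b = np.random.rand(batch, k, n).astype(np.float32)
    c = np.zeros((batch, m, n), dtype=np.float32)
    cl_a = Array(queue, a.shape, a.dtype); cl_a.set(a)
    cl_b = Array(queue, b.shape, b.dtype); cl_b.set(b)
    cl_c = Array(queue, c.shape, c.dtype); cl_c.set(c)

    # One matrix per batch, so the stride is simply the matrix size in elements
    pyclblast.gemmStridedBatched(queue, m, n, k, batch, cl_a, cl_b, cl_c,
                                 a_ld=k, b_ld=n, c_ld=n,
                                 a_stride=m * k, b_stride=k * n, c_stride=m * n)
    queue.finish()
    for i in range(batch):
        assert np.allclose(cl_c.get()[i], a[i] @ b[i], atol=1e-4)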
####################################################################################################
# Overrides the parameters
####################################################################################################

cdef extern from "clblast_c.h":
    ctypedef struct _cl_device_id:
        pass
    ctypedef _cl_device_id* cl_device_id
    CLBlastStatusCode CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, const CLBlastPrecision precision, const size_t num_parameters, const char** parameters_names, const size_t* parameters_values)

def override_parameters(device, kernel_name, precision, parameters):
    """
    Override the current parameters for the given kernel, on this device, with this precision.
    """
    cdef cl_device_id device_id = <cl_device_id><ptrdiff_t>device.int_ptr

    # read the parameters dictionary into names/values arrays, for use in CLBlastOverrideParameters
    cdef size_t n = len(parameters)
    cdef const char **parameter_names = <const char**> PyMem_Malloc(n * sizeof(char*))
    cdef size_t *parameter_values = <size_t*> PyMem_Malloc(n * sizeof(size_t))
    if not (parameter_names and parameter_values):
        raise MemoryError()
    for i, (k, v) in enumerate(parameters.items()):
        parameter_names[i] = strdup(k.encode('ascii'))
        parameter_values[i] = v

    # call the underlying API
    err = CLBlastOverrideParameters(device_id, kernel_name.encode('ascii'), precision, n, parameter_names, parameter_values)

    if err != CLBlastSuccess:
        raise RuntimeError("PyCLBlast: 'OverrideParameters' failed: %s" % get_status_message(err))

    # tidy up:
    PyMem_Free(parameter_names)
    PyMem_Free(parameter_values)

####################################################################################################
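A hedged sketch of the override hook above. The precision constant 32 corresponds to single precision in CLBlast's CLBlastPrecision enumeration; the Xaxpy parameter names and values shown are illustrative only and must be valid tuning parameters for the target device (see the CLBlast tuning documentation):

    import pyopencl as cl
    import pyclblast

    ctx = cl.create_some_context()
    device = ctx.devices[0]
    # Illustrative values: WGS/WPT/VW are the Xaxpy kernel's tuning parameters
    pyclblast.override_parameters(device, "Xaxpy", 32, {"WGS": 64, "WPT": 1, "VW": 1})
    # Subsequent single-precision AXPY calls on this device use the new parameters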
CLBlast-1.6.3/src/pyclblast/test/
CLBlast-1.6.3/src/pyclblast/test/__init__.py
CLBlast-1.6.3/src/pyclblast/test/test_pyclblast.py

####################################################################################################
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0.
#
# Author(s):
#   Cedric Nugteren <www.cedricnugteren.nl>
#
# This file tests PyCLBlast: the Python interface to CLBlast. It is not exhaustive. For full testing
# it is recommended to run the regular CLBlast tests; this is just a small smoke test.
#
####################################################################################################

import unittest

import numpy as np
import pyopencl as cl
from pyopencl.array import Array

import pyclblast


class TestPyCLBlast(unittest.TestCase):

    @staticmethod
    def setup(sizes, dtype):
        ctx = cl.create_some_context()
        queue = cl.CommandQueue(ctx)
        host_arrays, device_arrays = [], []
        for size in sizes:
            numpy_array = np.random.rand(*size).astype(dtype=dtype)
            opencl_array = Array(queue, numpy_array.shape, numpy_array.dtype)
            opencl_array.set(numpy_array)
            host_arrays.append(numpy_array)
            device_arrays.append(opencl_array)
        queue.finish()
        return queue, host_arrays, device_arrays

    def test_axpy(self):
        for dtype in ["float32", "complex64"]:
            for alpha in [1.0, 3.1]:
                for n in [1, 7, 32]:
                    queue, h, d = self.setup([(n,), (n,)], dtype=dtype)
                    pyclblast.axpy(queue, n, d[0], d[1], alpha=alpha)
                    queue.finish()
                    result = d[1].get()
                    reference = alpha * h[0] + h[1]
                    for i in range(n):
                        self.assertAlmostEqual(reference[i], result[i], places=3)

    def test_gemv(self):
        for dtype in ["float32", "complex64"]:
            for beta in [1.0]:
                for alpha in [1.0, 3.1]:
                    for m in [1, 7, 32]:
                        for n in [1, 7, 32]:
                            queue, h, d = self.setup([(m, n), (n,), (m,)], dtype=dtype)
                            pyclblast.gemv(queue, m, n, d[0], d[1], d[2], a_ld=n, alpha=alpha, beta=beta)
                            queue.finish()
                            result = d[2].get()
                            reference = alpha * np.dot(h[0], h[1]) + beta * h[2]
                            for i in range(m):
                                self.assertAlmostEqual(reference[i], result[i], places=3)

    def test_gemm(self):
        for dtype in ["float32", "complex64"]:
            for beta in [1.0]:
                for alpha in [1.0, 3.1]:
                    for m in [1, 7, 32]:
                        for n in [1, 7, 32]:
                            for k in [1, 7, 32]:
                                queue, h, d = self.setup([(m, k), (k, n), (m, n)], dtype=dtype)
                                pyclblast.gemm(queue, m, n, k, d[0], d[1], d[2], a_ld=k, b_ld=n, c_ld=n, alpha=alpha, beta=beta)
                                queue.finish()
                                result = d[2].get()
                                reference = alpha * np.dot(h[0], h[1]) + beta * h[2]
                                for i in range(m):
                                    for j in range(n):
                                        self.assertAlmostEqual(reference[i, j], result[i, j], places=3)
CLBlast-1.6.3/src/routine.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Routine base class (see the header for information about the class).
//
// =================================================================================================

#include <string>
#include <vector>
#include <chrono>
#include <cstdlib>

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// For each kernel this map contains a list of routines it is used in
const std::vector<std::string> Routine::routines_axpy = {"AXPY", "COPY", "SCAL", "SWAP"};
const std::vector<std::string> Routine::routines_dot = {"AMAX", "ASUM", "DOT", "DOTC", "DOTU", "MAX", "MIN", "NRM2", "SUM"};
const std::vector<std::string> Routine::routines_ger = {"GER", "GERC", "GERU", "HER", "HER2", "HPR", "HPR2", "SPR", "SPR2", "SYR", "SYR2"};
const std::vector<std::string> Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV", "TRSV"};
const std::vector<std::string> Routine::routines_gemm = {"GEMM", "HEMM", "SYMM", "TRMM"};
const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM", "TRSM"};
const std::vector<std::string> Routine::routines_trsm = {"TRSM"};
const std::unordered_map<std::string, const std::vector<std::string>> Routine::routines_by_kernel = {
  {"Xaxpy", routines_axpy},
  {"Xdot", routines_dot},
  {"Xgemv", routines_gemv},
  {"XgemvFast", routines_gemv},
  {"XgemvFastRot", routines_gemv},
  {"Xtrsv", routines_gemv},
  {"Xger", routines_ger},
  {"Copy", routines_gemm_syrk},
  {"Pad", routines_gemm_syrk},
  {"Transpose", routines_gemm_syrk},
  {"Padtranspose", routines_gemm_syrk},
  {"Xgemm", routines_gemm_syrk},
  {"XgemmDirect", routines_gemm},
  {"GemmRoutine", routines_gemm},
  {"Invert", routines_trsm},
};
// =================================================================================================

// The constructor does all heavy work, errors are returned as exceptions
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
                 const std::vector<std::string> &kernel_names, const Precision precision,
                 const std::vector<database::DatabaseEntry> &userDatabase,
                 std::initializer_list<const char *> source):
    precision_(precision),
    routine_name_(name),
    kernel_names_(kernel_names),
    queue_(queue),
    event_(event),
    context_(queue_.GetContext()),
    device_(queue_.GetDevice()),
    db_(kernel_names) {

  InitDatabase(device_, kernel_names, precision, userDatabase, db_);
  InitProgram(source);
}

void Routine::InitProgram(std::initializer_list<const char *> source) {

  // Determines the identifier for this particular routine call
  auto routine_info = routine_name_;
  for (const auto &kernel_name : kernel_names_) {
    routine_info += "_" + kernel_name + db_(kernel_name).GetValuesString();
  }
  log_debug(routine_info);

  // Queries the cache to see whether or not the program (context-specific) is already there
  bool has_program;
  program_ = ProgramCache::Instance().Get(ProgramKeyRef{ context_(), device_(), precision_, routine_info },
                                          &has_program);
  if (has_program) { return; }

  // Sets the build options from an environmental variable (if set)
  auto options = std::vector<std::string>();
  const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS");
  if (environment_variable != nullptr) {
    options.push_back(std::string(environment_variable));
  }
  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
  // is, a program is created and stored in the cache
  const auto device_name = GetDeviceName(device_);
  const auto platform_id = device_.PlatformID();
  bool has_binary;
  auto binary = BinaryCache::Instance().Get(BinaryKeyRef{ platform_id, precision_, routine_info, device_name },
                                            &has_binary);
  if (has_binary) {
    program_ = std::make_shared<Program>(device_, context_, binary);
    SetOpenCLKernelStandard(device_, options);
    program_->Build(device_, options);
    ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_info },
                                   std::shared_ptr<Program>{program_});
    return;
  }

  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
  // program will be added to the cache.

  // Inspects whether or not FP64 is supported in case of double precision
  if ((precision_ == Precision::kDouble && !PrecisionSupported<double>(device_)) ||
      (precision_ == Precision::kComplexDouble && !PrecisionSupported<double2>(device_))) {
    throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
  }

  // As above, but for FP16 (half precision)
  if (precision_ == Precision::kHalf && !PrecisionSupported<half>(device_)) {
    throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
  }

  // Collects the parameters for this device in the form of defines
  auto source_string = std::string{""};
  for (const auto &kernel_name : kernel_names_) {
    source_string += db_(kernel_name).GetDefines();
  }

  // Adds routine-specific code to the constructed source string
  for (const char *s: source) {
    source_string += s;
  }

  // Completes the source and compiles the kernel
  program_ = CompileFromSource(source_string, precision_, routine_name_,
                               device_, context_, options, 0);

  // Store the compiled binary and program in the cache
  BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
                                program_->GetIR());
  ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
                                 std::shared_ptr<Program>{program_});
}

// =================================================================================================
} // namespace clblast
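Since CLBLAST_BUILD_OPTIONS is read with std::getenv at kernel-compile time, it has to be set in the environment before the first routine call that triggers a build in a fresh process (cached programs and binaries skip compilation entirely). A sketch from the Python side; the specific flag is just an example of a standard OpenCL compiler option:

    import os
    # Extra flags passed verbatim to the OpenCL compiler when CLBlast builds its kernels
    os.environ["CLBLAST_BUILD_OPTIONS"] = "-cl-opt-disable"
    import pyclblast  # the first CLBlast kernel build in this process now picks up the options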
CLBlast-1.6.3/src/routine.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements all the basic functionality for the BLAS routines. This class serves as a
// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
// compiling the OpenCL kernel, connecting to the database, etc.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINE_H_
#define CLBLAST_ROUTINE_H_

#include <string>
#include <vector>
#include <unordered_map>

#include "utilities/utilities.hpp"
#include "cache.hpp"
#include "utilities/buffer_test.hpp"
#include "database/database.hpp"
#include "routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
class Routine {
 public:

  // Initializes db_, fetching cached database or building one
  static void InitDatabase(const Device &device, const std::vector<std::string> &kernel_names,
                           const Precision precision,
                           const std::vector<database::DatabaseEntry> &userDatabase,
                           Databases &db) {
    const auto platform_id = device.PlatformID();
    for (const auto &kernel_name : kernel_names) {

      // Queries the cache to see whether or not the kernel parameter database is already there
      bool has_db;
      db(kernel_name) = DatabaseCache::Instance().Get(DatabaseKeyRef{ platform_id, device(), precision, kernel_name },
                                                      &has_db);
      if (has_db) { continue; }

      // Builds the parameter database for this device and routine set and stores it in the cache
      log_debug("Searching database for kernel '" + kernel_name + "'");
      db(kernel_name) = Database(device, kernel_name, precision, userDatabase);
      DatabaseCache::Instance().Store(DatabaseKey{ platform_id, device(), precision, kernel_name },
                                      Database{ db(kernel_name) });
    }
  }

  // Base class constructor. The user database is an optional extra database to override the
  // built-in database.
  // All heavy preparation work is done inside this constructor.
  // NOTE: the caller must provide the same userDatabase for each combination of device, precision
  // and routine list, otherwise the caching logic will break.
  explicit Routine(Queue &queue, EventPointer event, const std::string &name,
                   const std::vector<std::string> &routines, const Precision precision,
                   const std::vector<database::DatabaseEntry> &userDatabase,
                   std::initializer_list<const char *> source);

  // List of kernel-routine look-ups
  static const std::vector<std::string> routines_axpy;
  static const std::vector<std::string> routines_dot;
  static const std::vector<std::string> routines_ger;
  static const std::vector<std::string> routines_gemv;
  static const std::vector<std::string> routines_gemm;
  static const std::vector<std::string> routines_gemm_syrk;
  static const std::vector<std::string> routines_trsm;
  static const std::unordered_map<std::string, const std::vector<std::string>> routines_by_kernel;

 private:

  // Initializes program_, fetching cached program or building one
  void InitProgram(std::initializer_list<const char *> source);

 protected:

  // Non-static variable for the precision
  const Precision precision_;

  // The routine's name and the corresponding kernels
  const std::string routine_name_;
  const std::vector<std::string> kernel_names_;

  // The OpenCL objects, accessible only from derived classes
  Queue queue_;
  EventPointer event_;
  const Context context_;
  const Device device_;

  // Compiled program (either retrieved from cache or compiled in slow path)
  std::shared_ptr<Program> program_;

  // Connection to the database for all the device-specific parameters
  Databases db_;
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINE_H_
#endif
CLBlast-1.6.3/src/routines/
CLBlast-1.6.3/src/routines/common.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the common routine functions (see the header for more information).
//
// =================================================================================================

#include <vector>
#include <chrono>
#include <cstdio>

#include "routines/common.hpp"

namespace clblast {
// =================================================================================================

// Enqueues a kernel, waits for completion, and checks for errors
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
               std::vector<size_t> global, const std::vector<size_t> &local,
               EventPointer event, const std::vector<Event> &waitForEvents) {

  if (!local.empty()) {
    // Tests for validity of the local thread sizes
    if (local.size() > device.MaxWorkItemDimensions()) {
      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
    }
    const auto max_work_item_sizes = device.MaxWorkItemSizes();
    for (auto i=size_t{0}; i<local.size(); ++i) {
      if (local[i] > max_work_item_sizes[i]) {
        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
      }
    }
    auto local_size = size_t{1};
    for (auto &item: local) { local_size *= item; }
    if (local_size > device.MaxWorkGroupSize()) {
      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal,
                             ToString(local_size) + " is larger than " +
                             ToString(device.MaxWorkGroupSize()));
    }

    // Make sure the global thread sizes are at least equal to the local sizes
    for (auto i=size_t{0}; i<global.size(); ++i) {
      if (global[i] < local[i]) { global[i] = local[i]; }
    }
  }

  // Records the start time in case of debugging in verbose mode
  #ifdef VERBOSE
    queue.Finish();
    const auto start_time = std::chrono::steady_clock::now();
  #endif

  // Launches the kernel (and checks for launch errors)
  kernel.Launch(queue, global, local, event, waitForEvents);

  // Prints the elapsed execution time in case of debugging in verbose mode
  #ifdef VERBOSE
    queue.Finish();
    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
    printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
  #endif
}

// =================================================================================================

// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
                const std::shared_ptr<Program> program, EventPointer event,
                const std::vector<Event> &waitForEvents,
                const size_t m, const size_t n, const size_t ld, const size_t offset,
                const Buffer<T> &dest, const T constant_value, const size_t local_size) {
  auto kernel = Kernel(program, "FillMatrix");
  kernel.SetArgument(0, static_cast<int>(m));
  kernel.SetArgument(1, static_cast<int>(n));
  kernel.SetArgument(2, static_cast<int>(ld));
  kernel.SetArgument(3, static_cast<int>(offset));
  kernel.SetArgument(4, dest());
  kernel.SetArgument(5, GetRealArg(constant_value));
  auto local = std::vector<size_t>{local_size, 1};
  auto global = std::vector<size_t>{Ceil(m, local_size), n};
  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}

// Compiles the above function
template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const size_t, const Buffer<half>&, const half, const size_t);
template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const size_t, const Buffer<float>&, const float, const size_t);
template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const size_t, const Buffer<double>&, const double, const size_t);
template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const size_t, const Buffer<float2>&, const float2, const size_t);
template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const size_t, const Buffer<double2>&, const double2, const size_t);
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
                const std::shared_ptr<Program> program, EventPointer event,
                const std::vector<Event> &waitForEvents,
                const size_t n, const size_t inc, const size_t offset,
                const Buffer<T> &dest, const T constant_value, const size_t local_size) {
  auto kernel = Kernel(program, "FillVector");
  kernel.SetArgument(0, static_cast<int>(n));
  kernel.SetArgument(1, static_cast<int>(inc));
  kernel.SetArgument(2, static_cast<int>(offset));
  kernel.SetArgument(3, dest());
  kernel.SetArgument(4, GetRealArg(constant_value));
  auto local = std::vector<size_t>{local_size};
  auto global = std::vector<size_t>{Ceil(n, local_size)};
  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}

// Compiles the above function
template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const Buffer<half>&, const half, const size_t);
template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const Buffer<float>&, const float, const size_t);
template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const Buffer<double>&, const double, const size_t);
template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const Buffer<float2>&, const float2, const size_t);
template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, EventPointer, const std::vector<Event>&, const size_t, const size_t, const size_t, const Buffer<double2>&, const double2, const size_t);

// =================================================================================================
} // namespace clblast
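The launchers above round global thread counts up to a multiple of the work-group size through CLBlast's Ceil/CeilDiv helpers; their arithmetic in a small Python sketch:

    def ceil_div(x, y):
        return 1 + ((x - 1) // y)  # smallest integer >= x / y, for x >= 1

    def ceil(x, y):
        return ceil_div(x, y) * y  # x rounded up to a multiple of y

    # e.g. FillVector with n=100 and local_size=64 launches 128 global threads
    assert ceil(100, 64) == 128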
CLBlast-1.6.3/src/routines/common.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the interfaces to common kernels, such as copying, padding, and
// transposing a matrix. These functions are templated and thus header-only. This file also contains
// other common functions to routines, such as a function to launch a kernel.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_COMMON_H_
#define CLBLAST_ROUTINES_COMMON_H_

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "utilities/compile.hpp"
#include "database/database.hpp"

namespace clblast {
// =================================================================================================

// Enqueues a kernel, waits for completion, and checks for errors
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
               std::vector<size_t> global, const std::vector<size_t> &local,
               EventPointer event, const std::vector<Event> &waitForEvents = {});

// =================================================================================================

// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
                const std::shared_ptr<Program> program, EventPointer event,
                const std::vector<Event> &waitForEvents,
                const size_t m, const size_t n, const size_t ld, const size_t offset,
                const Buffer<T> &dest, const T constant_value, const size_t local_size);

// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
                const std::shared_ptr<Program> program, EventPointer event,
                const std::vector<Event> &waitForEvents,
                const size_t n, const size_t inc, const size_t offset,
                const Buffer<T> &dest, const T constant_value, const size_t local_size);

// =================================================================================================

// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
void PadCopyTransposeMatrix(Queue &queue, const Device &device,
                            const Databases &db,
                            EventPointer event, const std::vector<Event> &waitForEvents,
                            const size_t src_one, const size_t src_two,
                            const size_t src_ld, const size_t src_offset,
                            const Buffer<T> &src,
                            const size_t dest_one, const size_t dest_two,
                            const size_t dest_ld, const size_t dest_offset,
                            const Buffer<T> &dest,
                            const T alpha,
                            const std::shared_ptr<Program> program, const bool do_pad,
                            const bool do_transpose, const bool do_conjugate,
                            const bool upper = false, const bool lower = false,
                            const bool diagonal_imag_zero = false) {

  // Determines whether or not the fast-version could potentially be used
  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);

  // Determines the right kernel
  auto kernel_name = std::string{};
  auto pad_kernel = false;
  if (do_transpose) {
    if (use_fast_kernel &&
        IsMultiple(src_ld, db["TRA_WPT"]) &&
        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
      kernel_name = "TransposeMatrixFast";
    }
    else {
      use_fast_kernel = false;
      pad_kernel = (do_pad || do_conjugate);
      kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix";
    }
  }
  else {
    if (use_fast_kernel &&
        IsMultiple(src_ld, db["COPY_VW"]) &&
        IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
        IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) {
      kernel_name = "CopyMatrixFast";
    }
    else {
      use_fast_kernel = false;
      pad_kernel = do_pad;
      kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix";
    }
  }

  // Retrieves the kernel from the compiled binary
  auto kernel = Kernel(program, kernel_name);

  // Sets the kernel arguments
  if (use_fast_kernel) {
    kernel.SetArgument(0, static_cast<int>(src_ld));
    kernel.SetArgument(1, src());
    kernel.SetArgument(2, dest());
    kernel.SetArgument(3, GetRealArg(alpha));
  }
  else {
    kernel.SetArgument(0, static_cast<int>(src_one));
    kernel.SetArgument(1, static_cast<int>(src_two));
    kernel.SetArgument(2, static_cast<int>(src_ld));
    kernel.SetArgument(3, static_cast<int>(src_offset));
    kernel.SetArgument(4, src());
    kernel.SetArgument(5, static_cast<int>(dest_one));
    kernel.SetArgument(6, static_cast<int>(dest_two));
    kernel.SetArgument(7, static_cast<int>(dest_ld));
    kernel.SetArgument(8, static_cast<int>(dest_offset));
    kernel.SetArgument(9, dest());
    kernel.SetArgument(10, GetRealArg(alpha));
    if (pad_kernel) {
      kernel.SetArgument(11, static_cast<int>(do_conjugate));
    }
    else {
      kernel.SetArgument(11, static_cast<int>(upper));
      kernel.SetArgument(12, static_cast<int>(lower));
      kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
    }
  }

  // Launches the kernel and returns the error code. Uses global and local thread sizes based on
  // parameters in the database.
  if (do_transpose) {
    if (use_fast_kernel) {
      const auto global = std::vector<size_t>{
        dest_one / db["TRA_WPT"],
        dest_two / db["TRA_WPT"]
      };
      const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
    else {
      const auto global = std::vector<size_t>{
        Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
        Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
      };
      const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
  }
  else {
    if (use_fast_kernel) {
      const auto global = std::vector<size_t>{
        dest_one / db["COPY_VW"],
        dest_two / db["COPY_WPT"]
      };
      const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
    else {
      const auto global = std::vector<size_t>{
        Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
        Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
      };
      const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
  }
}
"CopyPadMatrix" : "CopyMatrix"; } } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast(src_one)); kernel.SetArgument(1, static_cast(src_two)); kernel.SetArgument(2, static_cast(src_ld)); kernel.SetArgument(3, static_cast(src_offset)); kernel.SetArgument(4, src()); kernel.SetArgument(5, static_cast(dest_one)); kernel.SetArgument(6, static_cast(dest_two)); kernel.SetArgument(7, static_cast(dest_ld)); kernel.SetArgument(8, static_cast(dest_offset)); kernel.SetArgument(9, dest()); kernel.SetArgument(10, GetRealArg(alpha)); if (pad_kernel) { kernel.SetArgument(11, static_cast(do_conjugate)); } else { kernel.SetArgument(11, static_cast(upper)); kernel.SetArgument(12, static_cast(lower)); kernel.SetArgument(13, static_cast(diagonal_imag_zero)); } } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. if (do_transpose) { if (use_fast_kernel) { const auto global = std::vector{ dest_one / db["TRA_WPT"], dest_two / db["TRA_WPT"] }; const auto local = std::vector{db["TRA_DIM"], db["TRA_DIM"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector{ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) }; const auto local = std::vector{db["PADTRA_TILE"], db["PADTRA_TILE"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } else { if (use_fast_kernel) { const auto global = std::vector{ dest_one / db["COPY_VW"], dest_two / db["COPY_WPT"] }; const auto local = std::vector{db["COPY_DIMX"], db["COPY_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector{ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) }; const auto local = std::vector{db["PAD_DIMX"], db["PAD_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } } // Batched version of the above template void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device, const Databases &db, EventPointer event, const std::vector &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const Buffer &src_offsets, const Buffer &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const Buffer &dest_offsets, const Buffer &dest, const std::shared_ptr program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const size_t batch_count) { // Determines the right kernel auto kernel_name = std::string{}; if (do_transpose) { kernel_name = (do_pad) ? "TransposePadMatrixBatched" : "TransposeMatrixBatched"; } else { kernel_name = (do_pad) ? 
"CopyPadMatrixBatched" : "CopyMatrixBatched"; } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(src_one)); kernel.SetArgument(1, static_cast(src_two)); kernel.SetArgument(2, static_cast(src_ld)); kernel.SetArgument(3, src_offsets()); kernel.SetArgument(4, src()); kernel.SetArgument(5, static_cast(dest_one)); kernel.SetArgument(6, static_cast(dest_two)); kernel.SetArgument(7, static_cast(dest_ld)); kernel.SetArgument(8, dest_offsets()); kernel.SetArgument(9, dest()); if (do_pad) { kernel.SetArgument(10, static_cast(do_conjugate)); } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. if (do_transpose) { const auto global = std::vector{ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]), batch_count }; const auto local = std::vector{db["PADTRA_TILE"], db["PADTRA_TILE"], 1}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector{ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]), batch_count }; const auto local = std::vector{db["PAD_DIMX"], db["PAD_DIMY"], 1}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } // Batched version of the above template void PadCopyTransposeMatrixStridedBatched(Queue &queue, const Device &device, const Databases &db, EventPointer event, const std::vector &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const size_t src_stride, const Buffer &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const size_t dest_stride, const Buffer &dest, const std::shared_ptr program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const size_t batch_count) { // Determines the right kernel auto kernel_name = std::string{}; if (do_transpose) { kernel_name = (do_pad) ? "TransposePadMatrixStridedBatched" : "TransposeMatrixStridedBatched"; } else { kernel_name = (do_pad) ? "CopyPadMatrixStridedBatched" : "CopyMatrixStridedBatched"; } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(src_one)); kernel.SetArgument(1, static_cast(src_two)); kernel.SetArgument(2, static_cast(src_ld)); kernel.SetArgument(3, static_cast(src_offset)); kernel.SetArgument(4, static_cast(src_stride)); kernel.SetArgument(5, src()); kernel.SetArgument(6, static_cast(dest_one)); kernel.SetArgument(7, static_cast(dest_two)); kernel.SetArgument(8, static_cast(dest_ld)); kernel.SetArgument(9, static_cast(dest_offset)); kernel.SetArgument(10, static_cast(dest_stride)); kernel.SetArgument(11, dest()); if (do_pad) { kernel.SetArgument(12, static_cast(do_conjugate)); } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. 
CLBlast-1.6.3/src/routines/level1/
CLBlast-1.6.3/src/routines/level1/xamax.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamax class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xamax.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/xamax.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xamax<T>::DoAmax(const size_t n,
                      const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorIndex(1, imax_buffer, imax_offset);

  // Retrieves the Xamax kernels from the compiled binary
  auto kernel1 = Kernel(program_, "Xamax");
  auto kernel2 = Kernel(program_, "XamaxEpilogue");

  // Creates the buffer for intermediate values
  auto temp_size = 2*db_["WGS2"];
  auto temp_buffer1 = Buffer<T>(context_, temp_size);
  auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);

  // Sets the kernel arguments
  kernel1.SetArgument(0, static_cast<int>(n));
  kernel1.SetArgument(1, x_buffer());
  kernel1.SetArgument(2, static_cast<int>(x_offset));
  kernel1.SetArgument(3, static_cast<int>(x_inc));
  kernel1.SetArgument(4, temp_buffer1());
  kernel1.SetArgument(5, temp_buffer2());

  // Event waiting list
  auto eventWaitList = std::vector<Event>();

  // Launches the main kernel
  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
  auto local1 = std::vector<size_t>{db_["WGS1"]};
  auto kernelEvent = Event();
  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
  eventWaitList.push_back(kernelEvent);

  // Sets the arguments for the epilogue kernel
  kernel2.SetArgument(0, temp_buffer1());
  kernel2.SetArgument(1, temp_buffer2());
  kernel2.SetArgument(2, imax_buffer());
  kernel2.SetArgument(3, static_cast<int>(imax_offset));

  // Launches the epilogue kernel
  auto global2 = std::vector<size_t>{db_["WGS2"]};
  auto local2 = std::vector<size_t>{db_["WGS2"]};
  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}

// =================================================================================================

// Compiles the templated class
template class Xamax<half>;
template class Xamax<float>;
template class Xamax<double>;
template class Xamax<float2>;
template class Xamax<double2>;

// =================================================================================================
} // namespace clblast
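A NumPy sketch of the two-stage reduction that DoAmax implements on the device: a first pass reduces the input to a small set of partial (index, value) candidates, and an epilogue pass reduces those to the final index (the chunking mirrors the temp buffers above only loosely; temp_size is illustrative):

    import numpy as np

    def amax_two_stage(x, temp_size=4):
        chunks = np.array_split(np.arange(x.size), temp_size)
        # First stage: per-chunk index of the largest absolute value
        part_idx = [c[np.argmax(np.abs(x[c]))] for c in chunks if c.size]
        part_val = [np.abs(x[i]) for i in part_idx]
        # Epilogue stage: reduce the partial candidates to the final index
        return part_idx[int(np.argmax(part_val))]

    x = np.array([1.0, -7.5, 3.0, 7.4])
    assert amax_two_stage(x) == 1  # |-7.5| is the largest absolute value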
CLBlast-1.6.3/src/routines/level1/xamax.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamax routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XAMAX_H_
#define CLBLAST_ROUTINES_XAMAX_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xamax: public Routine {
 public:

  // Constructor
  Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");

  // Templated-precision implementation of the routine
  void DoAmax(const size_t n,
              const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XAMAX_H_
#endif

CLBlast-1.6.3/src/routines/level1/xamin.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamin routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XAMIN_H_
#define CLBLAST_ROUTINES_XAMIN_H_

#include "routine.hpp"
#include "routines/level1/xamax.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xamin: public Xamax<T> {
 public:

  // Members and methods from the base class
  using Xamax<T>::DoAmax;

  // Constructor
  Xamin(Queue &queue, EventPointer event, const std::string &name = "AMIN"):
    Xamax<T>(queue, event, name) {
  }

  // Forwards to the regular max-absolute version. The implementation difference is realised in the
  // kernel through a pre-processor macro based on the name of the routine.
  void DoAmin(const size_t n,
              const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
    DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XAMIN_H_
#endif
CLBlast-1.6.3/src/routines/level1/xasum.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xasum class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xasum.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/xasum.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xasum<T>::DoAsum(const size_t n,
                      const Buffer<T> &asum_buffer, const size_t asum_offset,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorScalar(1, asum_buffer, asum_offset);

  // Retrieves the Xasum kernels from the compiled binary
  auto kernel1 = Kernel(program_, "Xasum");
  auto kernel2 = Kernel(program_, "XasumEpilogue");

  // Creates the buffer for intermediate values
  auto temp_size = 2*db_["WGS2"];
  auto temp_buffer = Buffer<T>(context_, temp_size);

  // Sets the kernel arguments
  kernel1.SetArgument(0, static_cast<int>(n));
  kernel1.SetArgument(1, x_buffer());
  kernel1.SetArgument(2, static_cast<int>(x_offset));
  kernel1.SetArgument(3, static_cast<int>(x_inc));
  kernel1.SetArgument(4, temp_buffer());

  // Event waiting list
  auto eventWaitList = std::vector<Event>();

  // Launches the main kernel
  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
  auto local1 = std::vector<size_t>{db_["WGS1"]};
  auto kernelEvent = Event();
  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
  eventWaitList.push_back(kernelEvent);

  // Sets the arguments for the epilogue kernel
  kernel2.SetArgument(0, temp_buffer());
  kernel2.SetArgument(1, asum_buffer());
  kernel2.SetArgument(2, static_cast<int>(asum_offset));

  // Launches the epilogue kernel
  auto global2 = std::vector<size_t>{db_["WGS2"]};
  auto local2 = std::vector<size_t>{db_["WGS2"]};
  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}

// =================================================================================================

// Compiles the templated class
template class Xasum<half>;
template class Xasum<float>;
template class Xasum<double>;
template class Xasum<float2>;
template class Xasum<double2>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level1/xasum.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xasum routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XASUM_H_
#define CLBLAST_ROUTINES_XASUM_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xasum: public Routine {
 public:

  // Constructor
  Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");

  // Templated-precision implementation of the routine
  void DoAsum(const size_t n,
              const Buffer<T> &asum_buffer, const size_t asum_offset,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XASUM_H_
#endif

CLBlast-1.6.3/src/routines/level1/xaxpy.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xaxpy class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xaxpy.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xaxpy.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
                                 (y_offset == 0) && (y_inc == 1) &&
                                 IsMultiple(n, db_["WPT"]*db_["VW"]);
  const auto use_fastest_kernel = use_faster_kernel &&
                                  IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);

  // If possible, run the fast-version of the kernel
  const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" :
                           (use_faster_kernel) ? "XaxpyFaster" : "Xaxpy";

  // Retrieves the Xaxpy kernel from the compiled binary
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments
  if (use_faster_kernel || use_fastest_kernel) {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, x_buffer());
    kernel.SetArgument(3, y_buffer());
  }
  else {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, x_buffer());
    kernel.SetArgument(3, static_cast<int>(x_offset));
    kernel.SetArgument(4, static_cast<int>(x_inc));
    kernel.SetArgument(5, y_buffer());
    kernel.SetArgument(6, static_cast<int>(y_offset));
    kernel.SetArgument(7, static_cast<int>(y_inc));
  }

  // Launches the kernel
  if (use_fastest_kernel) {
    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else if (use_faster_kernel) {
    auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else {
    const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
}

// =================================================================================================

// Compiles the templated class
template class Xaxpy<half>;
template class Xaxpy<float>;
template class Xaxpy<double>;
template class Xaxpy<float2>;
template class Xaxpy<double2>;

// =================================================================================================
} // namespace clblast
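// A small self-contained sketch of the kernel-selection rule used in DoAxpy above, with concrete
// (illustrative, not tuned) database values WGS = 64, WPT = 1, VW = 4. The helper name
// 'ChooseAxpyKernel' is hypothetical; it only mirrors the conditions in the routine.

#include <cstddef>
#include <string>

std::string ChooseAxpyKernel(const size_t n, const size_t x_offset, const size_t x_inc,
                             const size_t y_offset, const size_t y_inc) {
  const size_t WGS = 64, WPT = 1, VW = 4;  // example database values
  const auto contiguous = (x_offset == 0) && (x_inc == 1) && (y_offset == 0) && (y_inc == 1);
  const auto faster = contiguous && (n % (WPT * VW) == 0);     // IsMultiple(n, WPT*VW)
  const auto fastest = faster && (n % (WGS * WPT * VW) == 0);  // IsMultiple(n, WGS*WPT*VW)
  return fastest ? "XaxpyFastest" : faster ? "XaxpyFaster" : "Xaxpy";
}

// For n = 1024 this yields "XaxpyFastest" (1024 is a multiple of 64*1*4 = 256); for n = 1000 it
// falls back to "XaxpyFaster" (a multiple of 4 but not of 256); with x_inc = 2 it yields "Xaxpy".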
CLBlast-1.6.3/src/routines/level1/xaxpy.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xaxpy routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XAXPY_H_
#define CLBLAST_ROUTINES_XAXPY_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xaxpy: public Routine {
 public:

  // Constructor
  Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");

  // Templated-precision implementation of the routine
  void DoAxpy(const size_t n, const T alpha,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XAXPY_H_
#endif

CLBlast-1.6.3/src/routines/level1/xcopy.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xcopy class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xcopy.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xcopy.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xcopy<T>::DoCopy(const size_t n,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
                         (y_offset == 0) && (y_inc == 1) &&
                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);

  // If possible, run the fast-version of the kernel
  auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";

  // Retrieves the Xcopy kernel from the compiled binary
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments
  if (use_fast_kernel) {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, x_buffer());
    kernel.SetArgument(2, y_buffer());
  }
  else {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, x_buffer());
    kernel.SetArgument(2, static_cast<int>(x_offset));
    kernel.SetArgument(3, static_cast<int>(x_inc));
    kernel.SetArgument(4, y_buffer());
    kernel.SetArgument(5, static_cast<int>(y_offset));
    kernel.SetArgument(6, static_cast<int>(y_inc));
  }

  // Launches the kernel
  if (use_fast_kernel) {
    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else {
    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
}

// =================================================================================================

// Compiles the templated class
template class Xcopy<half>;
template class Xcopy<float>;
template class Xcopy<double>;
template class Xcopy<float2>;
template class Xcopy<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level1/xcopy.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xcopy routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XCOPY_H_
#define CLBLAST_ROUTINES_XCOPY_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xcopy: public Routine {
 public:

  // Constructor
  Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");

  // Templated-precision implementation of the routine
  void DoCopy(const size_t n,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XCOPY_H_
#endif
CLBlast-1.6.3/src/routines/level1/xdot.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xdot class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xdot.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/xdot.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xdot<T>::DoDot(const size_t n,
                    const Buffer<T> &dot_buffer, const size_t dot_offset,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const bool do_conjugate) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorY(n, y_buffer, y_offset, y_inc);
  TestVectorScalar(1, dot_buffer, dot_offset);

  // Retrieves the Xdot kernels from the compiled binary
  auto kernel1 = Kernel(program_, "Xdot");
  auto kernel2 = Kernel(program_, "XdotEpilogue");

  // Creates the buffer for intermediate values
  auto temp_size = 2*db_["WGS2"];
  auto temp_buffer = Buffer<T>(context_, temp_size);

  // Sets the kernel arguments
  kernel1.SetArgument(0, static_cast<int>(n));
  kernel1.SetArgument(1, x_buffer());
  kernel1.SetArgument(2, static_cast<int>(x_offset));
  kernel1.SetArgument(3, static_cast<int>(x_inc));
  kernel1.SetArgument(4, y_buffer());
  kernel1.SetArgument(5, static_cast<int>(y_offset));
  kernel1.SetArgument(6, static_cast<int>(y_inc));
  kernel1.SetArgument(7, temp_buffer());
  kernel1.SetArgument(8, static_cast<int>(do_conjugate));

  // Event waiting list
  auto eventWaitList = std::vector<Event>();

  // Launches the main kernel
  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
  auto local1 = std::vector<size_t>{db_["WGS1"]};
  auto kernelEvent = Event();
  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
  eventWaitList.push_back(kernelEvent);

  // Sets the arguments for the epilogue kernel
  kernel2.SetArgument(0, temp_buffer());
  kernel2.SetArgument(1, dot_buffer());
  kernel2.SetArgument(2, static_cast<int>(dot_offset));

  // Launches the epilogue kernel
  auto global2 = std::vector<size_t>{db_["WGS2"]};
  auto local2 = std::vector<size_t>{db_["WGS2"]};
  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}

// =================================================================================================

// Compiles the templated class
template class Xdot<half>;
template class Xdot<float>;
template class Xdot<double>;
template class Xdot<float2>;
template class Xdot<double2>;

// =================================================================================================
} // namespace clblast
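// Illustrative only: a minimal sketch of driving DoDot directly. The Xdotc/Xdotu wrappers further
// below are thin forwards that only pin the final 'do_conjugate' argument. Set-up names here are
// hypothetical; the DoDot call matches the signature above.

#include "routines/level1/xdot.hpp"

namespace clblast {
void RunDotExample(Queue &queue, EventPointer event,
                   const Buffer<float2> &dot_buffer,
                   const Buffer<float2> &x_buffer, const Buffer<float2> &y_buffer,
                   const size_t n) {
  auto routine = Xdot<float2>(queue, event);
  routine.DoDot(n, dot_buffer, 0,  // result written at offset 0
                x_buffer, 0, 1,    // x with offset 0 and unit stride
                y_buffer, 0, 1,    // y with offset 0 and unit stride
                true);             // conjugate x, i.e. the DOTC behaviour
}
} // namespace clblast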
CLBlast-1.6.3/src/routines/level1/xdot.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xdot routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XDOT_H_
#define CLBLAST_ROUTINES_XDOT_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xdot: public Routine {
 public:

  // Constructor
  Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");

  // Templated-precision implementation of the routine
  void DoDot(const size_t n,
             const Buffer<T> &dot_buffer, const size_t dot_offset,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
             const bool do_conjugate = false);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XDOT_H_
#endif

CLBlast-1.6.3/src/routines/level1/xdotc.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xdotc class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xdotc.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
    Xdot<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xdotc<T>::DoDotc(const size_t n,
                      const Buffer<T> &dot_buffer, const size_t dot_offset,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
  DoDot(n, dot_buffer, dot_offset,
        x_buffer, x_offset, x_inc,
        y_buffer, y_offset, y_inc, true);
}

// =================================================================================================

// Compiles the templated class
template class Xdotc<float2>;
template class Xdotc<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level1/xdotc.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xdotc routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XDOTC_H_
#define CLBLAST_ROUTINES_XDOTC_H_

#include "routines/level1/xdot.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xdotc: public Xdot<T> {
 public:

  // Uses the regular Xdot routine
  using Xdot<T>::DoDot;

  // Constructor
  Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");

  // Templated-precision implementation of the routine
  void DoDotc(const size_t n,
              const Buffer<T> &dot_buffer, const size_t dot_offset,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XDOTC_H_
#endif

CLBlast-1.6.3/src/routines/level1/xdotu.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xdotu class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xdotu.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
    Xdot<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xdotu<T>::DoDotu(const size_t n,
                      const Buffer<T> &dot_buffer, const size_t dot_offset,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
  DoDot(n, dot_buffer, dot_offset,
        x_buffer, x_offset, x_inc,
        y_buffer, y_offset, y_inc, false);
}

// =================================================================================================

// Compiles the templated class
template class Xdotu<float2>;
template class Xdotu<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level1/xdotu.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xdotu routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XDOTU_H_
#define CLBLAST_ROUTINES_XDOTU_H_

#include "routines/level1/xdot.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xdotu: public Xdot<T> {
 public:

  // Uses the regular Xdot routine
  using Xdot<T>::DoDot;

  // Constructor
  Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");

  // Templated-precision implementation of the routine
  void DoDotu(const size_t n,
              const Buffer<T> &dot_buffer, const size_t dot_offset,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XDOTU_H_
#endif

CLBlast-1.6.3/src/routines/level1/xmax.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xmax routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XMAX_H_
#define CLBLAST_ROUTINES_XMAX_H_

#include "routine.hpp"
#include "routines/level1/xamax.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xmax: public Xamax<T> {
 public:

  // Members and methods from the base class
  using Xamax<T>::DoAmax;

  // Constructor
  Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"):
    Xamax<T>(queue, event, name) {
  }

  // Forwards to the regular absolute-max (AMAX) routine. The implementation difference is realised
  // in the kernel through a pre-processor macro based on the name of the routine.
  void DoMax(const size_t n,
             const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
    DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XMAX_H_
#endif
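// The AMAX/AMIN/MAX/MIN wrappers in this directory all reuse a single kernel source; only the
// routine name passed to the Routine base class differs. A hedged illustration of the kernel-side
// dispatch this enables (the macro and helper names below are illustrative, not copied from the
// actual kernels):
//
//   #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN)  // plain max/min: compare raw values
//     #define LOAD_VALUE(x) (x)
//   #else                                             // AMAX/AMIN: compare absolute values
//     #define LOAD_VALUE(x) fabs(x)
//   #endif
//
// Because the name is a constructor argument ("AMAX", "AMIN", "MAX", "MIN"), each wrapper class
// compiles its own specialisation of the same source without duplicating any host code.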
CLBlast-1.6.3/src/routines/level1/xmin.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xmin routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XMIN_H_
#define CLBLAST_ROUTINES_XMIN_H_

#include "routine.hpp"
#include "routines/level1/xamax.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xmin: public Xamax<T> {
 public:

  // Members and methods from the base class
  using Xamax<T>::DoAmax;

  // Constructor
  Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"):
    Xamax<T>(queue, event, name) {
  }

  // Forwards to the regular absolute-max (AMAX) routine. The implementation difference is realised
  // in the kernel through a pre-processor macro based on the name of the routine.
  void DoMin(const size_t n,
             const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
    DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XMIN_H_
#endif

CLBlast-1.6.3/src/routines/level1/xnrm2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xnrm2 class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xnrm2.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/xnrm2.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xnrm2<T>::DoNrm2(const size_t n,
                      const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorScalar(1, nrm2_buffer, nrm2_offset);

  // Retrieves the Xnrm2 kernels from the compiled binary
  auto kernel1 = Kernel(program_, "Xnrm2");
  auto kernel2 = Kernel(program_, "Xnrm2Epilogue");

  // Creates the buffer for intermediate values
  auto temp_size = 2*db_["WGS2"];
  auto temp_buffer = Buffer<T>(context_, temp_size);

  // Sets the kernel arguments
  kernel1.SetArgument(0, static_cast<int>(n));
  kernel1.SetArgument(1, x_buffer());
  kernel1.SetArgument(2, static_cast<int>(x_offset));
  kernel1.SetArgument(3, static_cast<int>(x_inc));
  kernel1.SetArgument(4, temp_buffer());

  // Event waiting list
  auto eventWaitList = std::vector<Event>();

  // Launches the main kernel
  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
  auto local1 = std::vector<size_t>{db_["WGS1"]};
  auto kernelEvent = Event();
  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
  eventWaitList.push_back(kernelEvent);

  // Sets the arguments for the epilogue kernel
  kernel2.SetArgument(0, temp_buffer());
  kernel2.SetArgument(1, nrm2_buffer());
  kernel2.SetArgument(2, static_cast<int>(nrm2_offset));

  // Launches the epilogue kernel
  auto global2 = std::vector<size_t>{db_["WGS2"]};
  auto local2 = std::vector<size_t>{db_["WGS2"]};
  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}

// =================================================================================================

// Compiles the templated class
template class Xnrm2<half>;
template class Xnrm2<float>;
template class Xnrm2<double>;
template class Xnrm2<float2>;
template class Xnrm2<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level1/xnrm2.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xnrm2 routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XNRM2_H_
#define CLBLAST_ROUTINES_XNRM2_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xnrm2: public Routine {
 public:

  // Constructor
  Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");

  // Templated-precision implementation of the routine
  void DoNrm2(const size_t n,
              const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XNRM2_H_
#endif
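// All two-kernel reductions in this directory (ASUM, DOT, NRM2) synchronise their kernel pair the
// same way; a condensed sketch of the pattern visible above:
//
//   auto kernelEvent = Event();                       // completion event for the main kernel
//   RunKernel(kernel1, ..., kernelEvent.pointer());   // stage 1: per-work-group partial results
//   eventWaitList.push_back(kernelEvent);             // the epilogue must wait on stage 1
//   RunKernel(kernel2, ..., event_, eventWaitList);   // stage 2 signals the routine's own event
//
// The caller thus only ever observes 'event_': once it completes, both stages are guaranteed done.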
CLBlast-1.6.3/src/routines/level1/xscal.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xscal class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xscal.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xscal.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xscal<T>::DoScal(const size_t n, const T alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vector for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);

  // If possible, run the fast-version of the kernel
  auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";

  // Retrieves the Xscal kernel from the compiled binary
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments
  if (use_fast_kernel) {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, x_buffer());
  }
  else {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, x_buffer());
    kernel.SetArgument(3, static_cast<int>(x_offset));
    kernel.SetArgument(4, static_cast<int>(x_inc));
  }

  // Launches the kernel
  if (use_fast_kernel) {
    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else {
    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
}

// =================================================================================================

// Compiles the templated class
template class Xscal<half>;
template class Xscal<float>;
template class Xscal<double>;
template class Xscal<float2>;
template class Xscal<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level1/xscal.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xscal routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XSCAL_H_
#define CLBLAST_ROUTINES_XSCAL_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xscal: public Routine {
 public:

  // Constructor
  Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");

  // Templated-precision implementation of the routine
  void DoScal(const size_t n, const T alpha,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSCAL_H_
#endif

CLBlast-1.6.3/src/routines/level1/xsum.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsum routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XSUM_H_
#define CLBLAST_ROUTINES_XSUM_H_

#include "routine.hpp"
#include "routines/level1/xasum.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xsum: public Xasum<T> {
 public:

  // Members and methods from the base class
  using Xasum<T>::DoAsum;

  // Constructor
  Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"):
    Xasum<T>(queue, event, name) {
  }

  // Forwards to the regular absolute-value sum (ASUM) routine. The implementation difference is
  // realised in the kernel through a pre-processor macro based on the name of the routine.
  void DoSum(const size_t n,
             const Buffer<T> &sum_buffer, const size_t sum_offset,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
    DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSUM_H_
#endif
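// From user code these routines are reached through the public clblast.h entry points rather than
// the Do* methods. A minimal sketch, assuming an initialised cl_command_queue 'queue' and a
// cl_mem buffer 'x_mem' of at least n floats; 'ScaleVectorExample' is a hypothetical helper and
// error handling is reduced to a single status check:

#include <clblast.h>

void ScaleVectorExample(cl_command_queue queue, cl_mem x_mem, const size_t n) {
  auto event = cl_event{nullptr};
  const auto status = clblast::Scal<float>(n, 2.0f,      // x = 2 * x
                                           x_mem, 0, 1,  // offset 0, unit stride
                                           &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);  // the routine runs asynchronously: wait before reading x_mem
    clReleaseEvent(event);
  }
}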
CLBlast-1.6.3/src/routines/level1/xswap.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xswap class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level1/xswap.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xswap.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xswap<T>::DoSwap(const size_t n,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
                         (y_offset == 0) && (y_inc == 1) &&
                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);

  // If possible, run the fast-version of the kernel
  auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";

  // Retrieves the Xswap kernel from the compiled binary
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments
  if (use_fast_kernel) {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, x_buffer());
    kernel.SetArgument(2, y_buffer());
  }
  else {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, x_buffer());
    kernel.SetArgument(2, static_cast<int>(x_offset));
    kernel.SetArgument(3, static_cast<int>(x_inc));
    kernel.SetArgument(4, y_buffer());
    kernel.SetArgument(5, static_cast<int>(y_offset));
    kernel.SetArgument(6, static_cast<int>(y_inc));
  }

  // Launches the kernel
  if (use_fast_kernel) {
    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else {
    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
}

// =================================================================================================

// Compiles the templated class
template class Xswap<half>;
template class Xswap<float>;
template class Xswap<double>;
template class Xswap<float2>;
template class Xswap<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level1/xswap.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xswap routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XSWAP_H_
#define CLBLAST_ROUTINES_XSWAP_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xswap: public Routine {
 public:

  // Constructor
  Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");

  // Templated-precision implementation of the routine
  void DoSwap(const size_t n,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSWAP_H_
#endif
CLBlast-1.6.3/src/routines/level2/xgbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgbmv class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xgbmv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
    Xgemv<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
                      const size_t m, const size_t n, const size_t kl, const size_t ku,
                      const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Reverses the upper and lower band count
  auto rotated = (layout == Layout::kRowMajor);
  auto kl_real = (rotated) ? ku : kl;
  auto ku_real = (rotated) ? kl : ku;

  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
  // The specific banded matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_GBMV define.
  bool fast_kernels = false;
  MatVec(layout, a_transpose,
         m, n, alpha,
         a_buffer, a_offset, a_ld,
         x_buffer, x_offset, x_inc, beta,
         y_buffer, y_offset, y_inc,
         fast_kernels, fast_kernels,
         0, false, kl_real, ku_real);
}

// =================================================================================================

// Compiles the templated class
template class Xgbmv<half>;
template class Xgbmv<float>;
template class Xgbmv<double>;
template class Xgbmv<float2>;
template class Xgbmv<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level2/xgbmv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
// "MatVec" function directly.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XGBMV_H_
#define CLBLAST_ROUTINES_XGBMV_H_

#include "routines/level2/xgemv.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xgbmv: public Xgemv<T> {
 public:

  // Uses the generic matrix-vector routine
  using Xgemv<T>::MatVec;

  // Constructor
  Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");

  // Templated-precision implementation of the routine
  void DoGbmv(const Layout layout, const Transpose a_transpose,
              const size_t m, const size_t n, const size_t kl, const size_t ku,
              const T alpha,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XGBMV_H_
#endif
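// A worked example of the band-count swap in DoGbmv above. Take a band matrix with kl = 2 sub-
// and ku = 1 super-diagonals: its banded storage holds kl+ku+1 = 4 elements per column, which is
// also why MatVec later substitutes a_one = kl+ku+1 when kl or ku is non-zero. Under
// Layout::kRowMajor the same matrix is traversed as its transpose, so the band counts swap:
//
//   rotated = (layout == Layout::kRowMajor)   // e.g. true
//   kl_real = rotated ? ku : kl               //      = 1
//   ku_real = rotated ? kl : ku               //      = 2
//
// The stored band width stays kl+ku+1 = 4 either way; only the roles of the two counts change.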
CLBlast-1.6.3/src/routines/level2/xgemv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgemv class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xgemv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot", "TrsvRoutine"},
            PrecisionValue<T>(), {}, {
    #include "../../kernels/level2/xgemv.opencl"
    #include "../../kernels/level2/xgemv_fast.opencl"
    #include "../../kernels/level2/xtrsv.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
                      const size_t m, const size_t n, const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Performs the matrix-vector multiplication
  MatVec(layout, a_transpose,
         m, n, alpha,
         a_buffer, a_offset, a_ld,
         x_buffer, x_offset, x_inc, beta,
         y_buffer, y_offset, y_inc,
         true, true,
         0, false, 0, 0); // N/A for this routine
}

// =================================================================================================

// The generic implementation, also suited for other (non general) matrix-vector multiplications
template <typename T>
void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
                      const size_t m, const size_t n, const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                      bool fast_kernel, bool fast_kernel_rot, const size_t parameter,
                      const bool packed, const size_t kl, const size_t ku) {

  // Makes sure all dimensions are larger than zero
  if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes whether or not the matrix has an alternative layout (row or column-major).
  const auto a_altlayout = (layout == Layout::kRowMajor);
  auto a_one = (a_altlayout) ? n : m;
  const auto a_two = (a_altlayout) ? m : n;

  // Swap m and n if the matrix is transposed
  const auto a_transposed = (a_transpose != Transpose::kNo);
  const auto m_real = (a_transposed) ? n : m;
  const auto n_real = (a_transposed) ? m : n;

  // Special adjustments for banded matrices
  if (kl != 0 || ku != 0) {
    a_one = kl+ku+1;
  }

  // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
  const auto a_rotated = a_transposed ^ a_altlayout;

  // In case of complex data-types, the transpose can also become a conjugate transpose
  const auto a_conjugate = (a_transpose == Transpose::kConjugate);

  // Tests the matrix and the vectors for validity
  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
  else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
  TestVectorX(n_real, x_buffer, x_offset, x_inc);
  TestVectorY(m_real, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
                IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
                IsMultiple(n, db_["WGS2"]) &&
                IsMultiple(a_ld, db_["VW2"]);
  fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
                    IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
                    IsMultiple(n, db_["WGS3"]) &&
                    IsMultiple(a_ld, db_["VW3"]);

  // If possible, run the fast-version (rotated or non-rotated) of the kernel
  auto kernel_name = std::string{"Xgemv"};
  const auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
  auto global_size = m_ceiled / db_["WPT1"];
  auto local_size = db_["WGS1"];
  if (fast_kernel) {
    kernel_name = "XgemvFast";
    global_size = m_real / db_["WPT2"];
    local_size = db_["WGS2"];
  }
  if (fast_kernel_rot) {
    kernel_name = "XgemvFastRot";
    global_size = m_real;
    local_size = db_["WGS3"];
  }

  // Retrieves the Xgemv kernel from the compiled binary
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments
  kernel.SetArgument(0, static_cast<int>(m_real));
  kernel.SetArgument(1, static_cast<int>(n_real));
  kernel.SetArgument(2, GetRealArg(alpha));
  kernel.SetArgument(3, GetRealArg(beta));
  kernel.SetArgument(4, static_cast<int>(a_rotated));
  kernel.SetArgument(5, a_buffer());
  kernel.SetArgument(6, static_cast<int>(a_offset));
  kernel.SetArgument(7, static_cast<int>(a_ld));
  kernel.SetArgument(8, x_buffer());
  kernel.SetArgument(9, static_cast<int>(x_offset));
  kernel.SetArgument(10, static_cast<int>(x_inc));
  kernel.SetArgument(11, y_buffer());
  kernel.SetArgument(12, static_cast<int>(y_offset));
  kernel.SetArgument(13, static_cast<int>(y_inc));
  kernel.SetArgument(14, static_cast<int>(a_conjugate));
  kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
  kernel.SetArgument(16, static_cast<int>(kl));        // only used for banded matrices
  kernel.SetArgument(17, static_cast<int>(ku));        // only used for banded matrices

  // Launches the kernel
  auto global = std::vector<size_t>{global_size};
  auto local = std::vector<size_t>{local_size};
  RunKernel(kernel, queue_, device_, global, local, event_);
}

// =================================================================================================

// Compiles the templated class
template class Xgemv<half>;
template class Xgemv<float>;
template class Xgemv<double>;
template class Xgemv<float2>;
template class Xgemv<double2>;

// =================================================================================================
} // namespace clblast
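// The '^' in MatVec combines storage order and transposition into a single "rotated access" flag;
// the four cases, spelled out:
//
//   layout      a_transpose   a_altlayout   a_transposed   a_rotated = transposed ^ altlayout
//   kColMajor   kNo           false         false          false  (column-wise access)
//   kColMajor   kYes          false         true           true   (row-wise access)
//   kRowMajor   kNo           true          false          true
//   kRowMajor   kYes          true          true           false
//
// The non-rotated fast kernel (XgemvFast) therefore serves both "col-major, not transposed" and
// "row-major, transposed", while XgemvFastRot serves the other two cases.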
CLBlast-1.6.3/src/routines/level2/xgemv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgemv routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XGEMV_H_
#define CLBLAST_ROUTINES_XGEMV_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xgemv: public Routine {
 public:

  // Constructor
  Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");

  // Templated-precision implementation of the routine
  void DoGemv(const Layout layout, const Transpose a_transpose,
              const size_t m, const size_t n, const T alpha,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);

  // Generic version used also for other matrix-vector multiplications
  void MatVec(const Layout layout, const Transpose a_transpose,
              const size_t m, const size_t n, const T alpha,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
              bool fast_kernel, bool fast_kernel_rot, const size_t parameter,
              const bool packed, const size_t kl, const size_t ku);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XGEMV_H_
#endif
CLBlast-1.6.3/src/routines/level2/xger.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xger class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xger.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xger.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xger<T>::DoGer(const Layout layout,
                    const size_t m, const size_t n, const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Makes sure all dimensions are larger than zero
  if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes whether or not the matrix has an alternative layout (row or column-major).
  const auto a_is_rowmajor = (layout == Layout::kRowMajor);
  const auto a_one = (a_is_rowmajor) ? n : m;
  const auto a_two = (a_is_rowmajor) ? m : n;

  // Tests the matrix and the vectors for validity
  TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
  TestVectorX(m, x_buffer, x_offset, x_inc);
  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Retrieves the kernel from the compiled binary
  auto kernel = Kernel(program_, "Xger");

  // Sets the kernel arguments
  kernel.SetArgument(0, static_cast<int>(a_one));
  kernel.SetArgument(1, static_cast<int>(a_two));
  kernel.SetArgument(2, GetRealArg(alpha));
  kernel.SetArgument(3, x_buffer());
  kernel.SetArgument(4, static_cast<int>(x_offset));
  kernel.SetArgument(5, static_cast<int>(x_inc));
  kernel.SetArgument(6, y_buffer());
  kernel.SetArgument(7, static_cast<int>(y_offset));
  kernel.SetArgument(8, static_cast<int>(y_inc));
  kernel.SetArgument(9, a_buffer());
  kernel.SetArgument(10, static_cast<int>(a_offset));
  kernel.SetArgument(11, static_cast<int>(a_ld));
  kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));

  // Launches the kernel
  auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
  auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
  auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
  RunKernel(kernel, queue_, device_, global, local, event_);
}

// =================================================================================================

// Compiles the templated class
template class Xger<half>;
template class Xger<float>;
template class Xger<double>;
template class Xger<float2>;
template class Xger<double2>;

// =================================================================================================
} // namespace clblast
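// Concrete numbers for the 2D launch above, with illustrative (not tuned) values WPT = 2,
// WGS1 = 8, WGS2 = 8 and a 100x50 column-major matrix, so a_one = 100 and a_two = 50:
//
//   a_one_ceiled = Ceil(CeilDiv(100, 2), 8) = Ceil(50, 8) = 56
//   a_two_ceiled = Ceil(CeilDiv(50, 2), 8)  = Ceil(25, 8) = 32
//
// giving a 56x32 global grid of 8x8 work-groups. Judging from the WPT divisor, each work-item
// covers WPT elements per dimension (an assumption about the kernel, which is not shown here),
// with the out-of-range remainder masked off on the device.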
CLBlast-1.6.3/src/routines/level2/xger.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xger routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XGER_H_
#define CLBLAST_ROUTINES_XGER_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xger: public Routine {
 public:

  // Constructor
  Xger(Queue &queue, EventPointer event, const std::string &name = "GER");

  // Templated-precision implementation of the routine
  void DoGer(const Layout layout, const size_t m, const size_t n, const T alpha,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
             const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XGER_H_
#endif

CLBlast-1.6.3/src/routines/level2/xgerc.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgerc class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xgerc.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
    Xger<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xgerc<T>::DoGerc(const Layout layout, const size_t m, const size_t n, const T alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Regular Ger operation on complex data, plus conjugation in the kernel guarded by the
  // ROUTINE_GERC guard.
  DoGer(layout, m, n, alpha,
        x_buffer, x_offset, x_inc,
        y_buffer, y_offset, y_inc,
        a_buffer, a_offset, a_ld);
}

// =================================================================================================

// Compiles the templated class
template class Xgerc<float2>;
template class Xgerc<double2>;

// =================================================================================================
} // namespace clblast
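// =================================================================================================
// [Editor's note: illustrative sketch, not part of the CLBlast sources.] For reference, GERU and
// GERC both compute the rank-1 update A := alpha*x*y^T + A on an m-by-n matrix; GERC additionally
// conjugates the elements of y. A plain host-side reference of the GERC variant on column-major
// data (all names here are hypothetical):

#include <complex>
#include <cstddef>
#include <vector>

void ReferenceGerc(const size_t m, const size_t n, const std::complex<float> alpha,
                   const std::vector<std::complex<float>> &x,
                   const std::vector<std::complex<float>> &y,
                   std::vector<std::complex<float>> &a, const size_t a_ld) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t i = 0; i < m; ++i) {
      a[i + j * a_ld] += alpha * x[i] * std::conj(y[j]);  // GERU would drop the conj()
    }
  }
}
// =================================================================================================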
CLBlast-1.6.3/src/routines/level2/xgerc.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgerc routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XGERC_H_
#define CLBLAST_ROUTINES_XGERC_H_

#include "routines/level2/xger.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xgerc: public Xger<T> {
 public:

  // Uses the regular Xger routine
  using Xger<T>::DoGer;

  // Constructor
  Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");

  // Templated-precision implementation of the routine
  void DoGerc(const Layout layout, const size_t m, const size_t n, const T alpha,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XGERC_H_
#endif

CLBlast-1.6.3/src/routines/level2/xgeru.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgeru class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xgeru.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
    Xger<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xgeru<T>::DoGeru(const Layout layout, const size_t m, const size_t n, const T alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Regular Ger operation on complex data
  DoGer(layout, m, n, alpha,
        x_buffer, x_offset, x_inc,
        y_buffer, y_offset, y_inc,
        a_buffer, a_offset, a_ld);
}

// =================================================================================================

// Compiles the templated class
template class Xgeru<float2>;
template class Xgeru<double2>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level2/xgeru.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgeru routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XGERU_H_
#define CLBLAST_ROUTINES_XGERU_H_

#include "routines/level2/xger.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xgeru: public Xger<T> {
 public:

  // Uses the regular Xger routine
  using Xger<T>::DoGer;

  // Constructor
  Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");

  // Templated-precision implementation of the routine
  void DoGeru(const Layout layout, const size_t m, const size_t n, const T alpha,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XGERU_H_
#endif

CLBlast-1.6.3/src/routines/level2/xhbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhbmv class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xhbmv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
    Xgemv<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k,
                      const T alpha, const Buffer<T> &a_buffer, const size_t a_offset,
                      const size_t a_ld, const Buffer<T> &x_buffer, const size_t x_offset,
                      const size_t x_inc, const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));

  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
  // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HBMV define.
  bool fast_kernels = false;
  MatVec(layout, Transpose::kNo,
         n, n, alpha,
         a_buffer, a_offset, a_ld,
         x_buffer, x_offset, x_inc, beta,
         y_buffer, y_offset, y_inc,
         fast_kernels, fast_kernels,
         is_upper, false, k, 0);
}

// =================================================================================================

// Compiles the templated class
template class Xhbmv<float2>;
template class Xhbmv<double2>;

// =================================================================================================
} // namespace clblast
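// =================================================================================================
// [Editor's note: illustrative sketch, not part of the CLBlast sources.] The k that DoHbmv passes
// into MatVec (as 'kl') describes BLAS-style band storage. In the conventional column-major
// scheme for an upper-stored Hermitian band matrix with k super-diagonals, element A(i,j) with
// max(0, j-k) <= i <= j is stored at row (k + i - j) of column j, so with a leading dimension
// a_ld >= k + 1 the flattened index is:

#include <cstddef>

size_t BandIndexUpper(const size_t i, const size_t j, const size_t k, const size_t a_ld) {
  return (k + i - j) + j * a_ld;  // only valid inside the band: i <= j and j <= i + k
}
// =================================================================================================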
CLBlast-1.6.3/src/routines/level2/xhbmv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call
// the "MatVec" function directly.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XHBMV_H_
#define CLBLAST_ROUTINES_XHBMV_H_

#include "routines/level2/xgemv.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xhbmv: public Xgemv<T> {
 public:

  // Uses the generic matrix-vector routine
  using Xgemv<T>::MatVec;

  // Constructor
  Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");

  // Templated-precision implementation of the routine
  void DoHbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k,
              const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XHBMV_H_
#endif

CLBlast-1.6.3/src/routines/level2/xhemv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhemv class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xhemv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name):
    Xgemv<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));

  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
  // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HEMV define.
  bool fast_kernels = false;
  MatVec(layout, Transpose::kNo,
         n, n, alpha,
         a_buffer, a_offset, a_ld,
         x_buffer, x_offset, x_inc, beta,
         y_buffer, y_offset, y_inc,
         fast_kernels, fast_kernels,
         is_upper, false, 0, 0);
}

// =================================================================================================

// Compiles the templated class
template class Xhemv<float2>;
template class Xhemv<double2>;

// =================================================================================================
} // namespace clblast
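// =================================================================================================
// [Editor's note: illustrative sketch, not part of the CLBlast sources.] The is_upper expression
// above flips the requested triangle for row-major input: a row-major upper triangle occupies the
// same memory as a column-major lower triangle, and the kernels index column-major storage. As a
// reference for the operation itself, a HEMV y := alpha*A*x + beta*y that reads only the stored
// upper triangle and mirrors the rest through conjugate symmetry, A(j,i) = conj(A(i,j)):

#include <complex>
#include <cstddef>
#include <vector>

void ReferenceHemvUpper(const size_t n, const std::complex<double> alpha,
                        const std::vector<std::complex<double>> &a, const size_t a_ld,
                        const std::vector<std::complex<double>> &x,
                        const std::complex<double> beta, std::vector<std::complex<double>> &y) {
  for (size_t i = 0; i < n; ++i) {
    auto sum = std::complex<double>{0.0, 0.0};
    for (size_t j = 0; j < n; ++j) {
      const auto aij = (i <= j) ? a[i + j * a_ld]              // stored (upper) element
                                : std::conj(a[j + i * a_ld]);  // mirrored element
      sum += aij * x[j];
    }
    y[i] = alpha * sum + beta * y[i];
  }
}
// =================================================================================================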
CLBlast-1.6.3/src/routines/level2/xhemv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call
// the "MatVec" function directly.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XHEMV_H_
#define CLBLAST_ROUTINES_XHEMV_H_

#include "routines/level2/xgemv.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xhemv: public Xgemv<T> {
 public:

  // Uses the generic matrix-vector routine
  using Xgemv<T>::MatVec;

  // Constructor
  Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");

  // Templated-precision implementation of the routine
  void DoHemv(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XHEMV_H_
#endif
CLBlast-1.6.3/src/routines/level2/xher.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xher.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher.opencl"
    }) {
}

// =================================================================================================

// Specializations to compute alpha of type 'T'
template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return float2{alpha, 0.0f}; }
template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }

// =================================================================================================

// The main routine
template <typename T, typename U>
void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, const size_t n, const U alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const bool packed) {

  // Makes sure the dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // The data is either in the upper or lower triangle
  const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  const auto is_rowmajor = (layout == Layout::kRowMajor);

  // Tests the matrix and the vectors for validity
  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
  else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
  TestVectorX(n, x_buffer, x_offset, x_inc);

  // If alpha is zero an update is not required
  if (alpha == U{0}) { return; }

  // Creates a matching version of alpha
  const auto matching_alpha = GetAlpha(alpha);

  // Retrieves the kernel from the compiled binary
  auto kernel = Kernel(program_, "Xher");

  // Sets the kernel arguments
  kernel.SetArgument(0, static_cast<int>(n));
  kernel.SetArgument(1, GetRealArg(matching_alpha));
  kernel.SetArgument(2, x_buffer());
  kernel.SetArgument(3, static_cast<int>(x_offset));
  kernel.SetArgument(4, static_cast<int>(x_inc));
  kernel.SetArgument(5, a_buffer());
  kernel.SetArgument(6, static_cast<int>(a_offset));
  kernel.SetArgument(7, static_cast<int>(a_ld));
  kernel.SetArgument(8, static_cast<int>(is_upper));
  kernel.SetArgument(9, static_cast<int>(is_rowmajor));

  // Launches the kernel
  auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
  auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
  auto global = std::vector<size_t>{global_one, global_two};
  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
  RunKernel(kernel, queue_, device_, global, local, event_);
}

// =================================================================================================

// Compiles the templated class
template class Xher<half, half>;
template class Xher<float, float>;
template class Xher<double, double>;
template class Xher<float2, float>;
template class Xher<double2, double>;

// =================================================================================================
} // namespace clblast
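// =================================================================================================
// [Editor's note: illustrative sketch, not part of the CLBlast sources.] The GetAlpha
// specializations above exist because HER takes a *real* alpha even for complex matrices: the
// update A := alpha*x*x^H + A only stays Hermitian when alpha is real, so the complex variants
// promote it to a complex value with a zero imaginary part. Reference semantics of the update on
// the stored upper triangle:

#include <complex>
#include <cstddef>
#include <vector>

void ReferenceHerUpper(const size_t n, const float alpha,  // note: alpha is real-valued
                       const std::vector<std::complex<float>> &x,
                       std::vector<std::complex<float>> &a, const size_t a_ld) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t i = 0; i <= j; ++i) {  // upper triangle only; the lower half is implied
      a[i + j * a_ld] += alpha * x[i] * std::conj(x[j]);
    }
  }
}
// =================================================================================================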
CLBlast-1.6.3/src/routines/level2/xher.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XHER_H_
#define CLBLAST_ROUTINES_XHER_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T, typename U>
class Xher: public Routine {
 public:

  // Constructor
  Xher(Queue &queue, EventPointer event, const std::string &name = "HER");

  // Translates alpha of type 'U' into type 'T'
  T GetAlpha(const U alpha);

  // Templated-precision implementation of the routine
  void DoHer(const Layout layout, const Triangle triangle, const size_t n, const U alpha,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
             const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
             const bool packed = false);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XHER_H_
#endif
CLBlast-1.6.3/src/routines/level2/xher2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2 class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xher2.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher2.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const bool packed) {

  // Makes sure the dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // The data is either in the upper or lower triangle
  const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  const auto is_rowmajor = (layout == Layout::kRowMajor);

  // Tests the matrix and the vectors for validity
  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
  else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Retrieves the kernel from the compiled binary
  auto kernel = Kernel(program_, "Xher2");

  // Sets the kernel arguments
  kernel.SetArgument(0, static_cast<int>(n));
  kernel.SetArgument(1, GetRealArg(alpha));
  kernel.SetArgument(2, x_buffer());
  kernel.SetArgument(3, static_cast<int>(x_offset));
  kernel.SetArgument(4, static_cast<int>(x_inc));
  kernel.SetArgument(5, y_buffer());
  kernel.SetArgument(6, static_cast<int>(y_offset));
  kernel.SetArgument(7, static_cast<int>(y_inc));
  kernel.SetArgument(8, a_buffer());
  kernel.SetArgument(9, static_cast<int>(a_offset));
  kernel.SetArgument(10, static_cast<int>(a_ld));
  kernel.SetArgument(11, static_cast<int>(is_upper));
  kernel.SetArgument(12, static_cast<int>(is_rowmajor));

  // Launches the kernel
  auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
  auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
  auto global = std::vector<size_t>{global_one, global_two};
  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
  RunKernel(kernel, queue_, device_, global, local, event_);
}

// =================================================================================================

// Compiles the templated class
template class Xher2<half>;
template class Xher2<float>;
template class Xher2<double>;
template class Xher2<float2>;
template class Xher2<double2>;

// =================================================================================================
} // namespace clblast
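// =================================================================================================
// [Editor's note: illustrative sketch, not part of the CLBlast sources.] HER2 computes the rank-2
// update A := alpha*x*y^H + conj(alpha)*y*x^H + A, which keeps A Hermitian for any complex alpha
// (this is why, unlike HER, no real-valued alpha conversion is needed here). A minimal reference
// over the stored upper triangle:

#include <complex>
#include <cstddef>
#include <vector>

void ReferenceHer2Upper(const size_t n, const std::complex<float> alpha,
                        const std::vector<std::complex<float>> &x,
                        const std::vector<std::complex<float>> &y,
                        std::vector<std::complex<float>> &a, const size_t a_ld) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t i = 0; i <= j; ++i) {
      a[i + j * a_ld] += alpha * x[i] * std::conj(y[j]) +
                         std::conj(alpha) * y[i] * std::conj(x[j]);
    }
  }
}
// =================================================================================================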
CLBlast-1.6.3/src/routines/level2/xher2.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2 routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XHER2_H_
#define CLBLAST_ROUTINES_XHER2_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xher2: public Routine {
 public:

  // Constructor
  Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");

  // Templated-precision implementation of the routine
  void DoHer2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const bool packed = false);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XHER2_H_
#endif

CLBlast-1.6.3/src/routines/level2/xhpmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpmv class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xhpmv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name):
    Xgemv<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                      const Buffer<T> &ap_buffer, const size_t ap_offset,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));

  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
  // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HPMV define.
  bool fast_kernels = false;
  MatVec(layout, Transpose::kNo,
         n, n, alpha,
         ap_buffer, ap_offset, n,
         x_buffer, x_offset, x_inc, beta,
         y_buffer, y_offset, y_inc,
         fast_kernels, fast_kernels,
         is_upper, true, 0, 0);
}

// =================================================================================================

// Compiles the templated class
template class Xhpmv<float2>;
template class Xhpmv<double2>;

// =================================================================================================
} // namespace clblast
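// =================================================================================================
// [Editor's note: illustrative sketch, not part of the CLBlast sources.] The packed routines
// (HPMV, SPMV, HPR, SPR, ...) pass packed=true and substitute n for the leading dimension, since
// packed storage has none: one triangle of an n-by-n matrix is stored as n*(n+1)/2 consecutive
// elements. In the usual column-major "upper" convention, element A(i,j) with i <= j lives at:

#include <cstddef>

size_t PackedIndexUpper(const size_t i, const size_t j) {
  return i + (j * (j + 1)) / 2;  // column j starts after the 1 + 2 + ... + j earlier elements
}
// =================================================================================================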
CLBlast-1.6.3/src/routines/level2/xhpmv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call
// the "MatVec" function directly.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XHPMV_H_
#define CLBLAST_ROUTINES_XHPMV_H_

#include "routines/level2/xgemv.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xhpmv: public Xgemv<T> {
 public:

  // Uses the generic matrix-vector routine
  using Xgemv<T>::MatVec;

  // Constructor
  Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");

  // Templated-precision implementation of the routine
  void DoHpmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
              const Buffer<T> &ap_buffer, const size_t ap_offset,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XHPMV_H_
#endif

CLBlast-1.6.3/src/routines/level2/xhpr.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xhpr.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T, typename U>
Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name):
    Xher<T,U>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T, typename U>
void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle, const size_t n, const U alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xhpr functionality is implemented in the kernel using defines
  DoHer(layout, triangle, n, alpha,
        x_buffer, x_offset, x_inc,
        ap_buffer, ap_offset, n,
        true); // packed matrix
}

// =================================================================================================

// Compiles the templated class
template class Xhpr<float2, float>;
template class Xhpr<double2, double>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level2/xhpr.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XHPR_H_
#define CLBLAST_ROUTINES_XHPR_H_

#include "routines/level2/xher.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T, typename U>
class Xhpr: public Xher<T,U> {
 public:

  // Uses the regular Xher routine
  using Xher<T,U>::DoHer;

  // Constructor
  Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");

  // Templated-precision implementation of the routine
  void DoHpr(const Layout layout, const Triangle triangle, const size_t n, const U alpha,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
             const Buffer<T> &ap_buffer, const size_t ap_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XHPR_H_
#endif

CLBlast-1.6.3/src/routines/level2/xhpr2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr2 class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xhpr2.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name):
    Xher2<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xhpr2 functionality is implemented in the kernel using defines
  DoHer2(layout, triangle, n, alpha,
         x_buffer, x_offset, x_inc,
         y_buffer, y_offset, y_inc,
         ap_buffer, ap_offset, n,
         true); // packed matrix
}

// =================================================================================================

// Compiles the templated class
template class Xhpr2<float2>;
template class Xhpr2<double2>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level2/xhpr2.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr2 routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XHPR2_H_
#define CLBLAST_ROUTINES_XHPR2_H_

#include "routines/level2/xher2.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xhpr2: public Xher2<T> {
 public:

  // Uses the regular Xher2 routine
  using Xher2<T>::DoHer2;

  // Constructor
  Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");

  // Templated-precision implementation of the routine
  void DoHpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
              const Buffer<T> &ap_buffer, const size_t ap_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XHPR2_H_
#endif

CLBlast-1.6.3/src/routines/level2/xsbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsbmv class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xsbmv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name):
    Xgemv<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k,
                      const T alpha, const Buffer<T> &a_buffer, const size_t a_offset,
                      const size_t a_ld, const Buffer<T> &x_buffer, const size_t x_offset,
                      const size_t x_inc, const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));

  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
  // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_SBMV define.
  bool fast_kernels = false;
  MatVec(layout, Transpose::kNo,
         n, n, alpha,
         a_buffer, a_offset, a_ld,
         x_buffer, x_offset, x_inc, beta,
         y_buffer, y_offset, y_inc,
         fast_kernels, fast_kernels,
         is_upper, false, k, 0);
}

// =================================================================================================

// Compiles the templated class
template class Xsbmv<half>;
template class Xsbmv<float>;
template class Xsbmv<double>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level2/xsbmv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call
// the "MatVec" function directly.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XSBMV_H_
#define CLBLAST_ROUTINES_XSBMV_H_

#include "routines/level2/xgemv.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xsbmv: public Xgemv<T> {
 public:

  // Uses the generic matrix-vector routine
  using Xgemv<T>::MatVec;

  // Constructor
  Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");

  // Templated-precision implementation of the routine
  void DoSbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k,
              const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSBMV_H_
#endif
CLBlast-1.6.3/src/routines/level2/xspmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspmv class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xspmv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name):
    Xgemv<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                      const Buffer<T> &ap_buffer, const size_t ap_offset,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));

  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
  // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_SPMV define.
  bool fast_kernels = false;
  MatVec(layout, Transpose::kNo,
         n, n, alpha,
         ap_buffer, ap_offset, n,
         x_buffer, x_offset, x_inc, beta,
         y_buffer, y_offset, y_inc,
         fast_kernels, fast_kernels,
         is_upper, true, 0, 0);
}

// =================================================================================================

// Compiles the templated class
template class Xspmv<half>;
template class Xspmv<float>;
template class Xspmv<double>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level2/xspmv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call
// the "MatVec" function directly.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XSPMV_H_
#define CLBLAST_ROUTINES_XSPMV_H_

#include "routines/level2/xgemv.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xspmv: public Xgemv<T> {
 public:

  // Uses the generic matrix-vector routine
  using Xgemv<T>::MatVec;

  // Constructor
  Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");

  // Templated-precision implementation of the routine
  void DoSpmv(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
              const Buffer<T> &ap_buffer, const size_t ap_offset,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSPMV_H_
#endif

CLBlast-1.6.3/src/routines/level2/xspr.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xspr.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name):
    Xher<T,T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xspr functionality is implemented in the kernel using defines
  DoHer(layout, triangle, n, alpha,
        x_buffer, x_offset, x_inc,
        ap_buffer, ap_offset, n,
        true); // packed matrix
}

// =================================================================================================

// Compiles the templated class
template class Xspr<half>;
template class Xspr<float>;
template class Xspr<double>;

// =================================================================================================
} // namespace clblast
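// =================================================================================================
// [Editor's note: illustrative sketch, not part of the CLBlast sources.] Because DoSpr forwards
// to the packed variant of DoHer, the AP buffer only needs to hold the n*(n+1)/2 packed elements.
// A host-side call through the public API, assuming the clblast::Spr entry point from
// include/clblast.h and an already-initialised queue and buffers:

#include <clblast.h>

clblast::StatusCode RunSspr(cl_command_queue queue, cl_mem x, cl_mem ap, const size_t n) {
  // ap must hold at least (n * (n + 1)) / 2 floats
  cl_event event = nullptr;
  const auto status = clblast::Spr<float>(clblast::Layout::kColMajor, clblast::Triangle::kUpper,
                                          n,
                                          2.0f,     // alpha
                                          x, 0, 1,  // vector x with offset 0 and increment 1
                                          ap, 0,    // packed matrix AP with offset 0
                                          &queue, &event);
  if (status == clblast::StatusCode::kSuccess) { clWaitForEvents(1, &event); }
  return status;
}
// =================================================================================================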
CLBlast-1.6.3/src/routines/level2/xspr.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XSPR_H_
#define CLBLAST_ROUTINES_XSPR_H_

#include "routines/level2/xher.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xspr: public Xher<T,T> {
 public:

  // Uses the regular Xher routine
  using Xher<T,T>::DoHer;

  // Constructor
  Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");

  // Templated-precision implementation of the routine
  void DoSpr(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
             const Buffer<T> &ap_buffer, const size_t ap_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSPR_H_
#endif

CLBlast-1.6.3/src/routines/level2/xspr2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr2 class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xspr2.hpp"

#include <string>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name):
    Xher2<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xspr2 functionality is implemented in the kernel using defines
  DoHer2(layout, triangle, n, alpha,
         x_buffer, x_offset, x_inc,
         y_buffer, y_offset, y_inc,
         ap_buffer, ap_offset, n,
         true); // packed matrix
}

// =================================================================================================

// Compiles the templated class
template class Xspr2<half>;
template class Xspr2<float>;
template class Xspr2<double>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level2/xspr2.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr2 routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XSPR2_H_
#define CLBLAST_ROUTINES_XSPR2_H_

#include "routines/level2/xher2.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xspr2: public Xher2<T> {
 public:

  // Uses the regular Xher2 routine
  using Xher2<T>::DoHer2;

  // Constructor
  Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");

  // Templated-precision implementation of the routine
  void DoSpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
              const Buffer<T> &ap_buffer, const size_t ap_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSPR2_H_
#endif

CLBlast-1.6.3/src/routines/level2/xsymv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsymv class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level2/xsymv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name):
    Xgemv<T>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                      const T beta,
                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));

  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
  // The specific symmetric matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_SYMV define.
  bool fast_kernels = false;
  MatVec(layout, Transpose::kNo,
         n, n, alpha,
         a_buffer, a_offset, a_ld,
         x_buffer, x_offset, x_inc, beta,
         y_buffer, y_offset, y_inc,
         fast_kernels, fast_kernels,
         is_upper, false, 0, 0);
}

// =================================================================================================

// Compiles the templated class
template class Xsymv<half>;
template class Xsymv<float>;
template class Xsymv<double>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level2/xsymv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call
// the "MatVec" function directly.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XSYMV_H_
#define CLBLAST_ROUTINES_XSYMV_H_

#include "routines/level2/xgemv.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xsymv: public Xgemv<T> {
 public:

  // Uses the generic matrix-vector routine
  using Xgemv<T>::MatVec;

  // Constructor
  Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");

  // Templated-precision implementation of the routine
  void DoSymv(const Layout layout, const Triangle triangle, const size_t n, const T alpha,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSYMV_H_
#endif
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XSYMV_H_ #define CLBLAST_ROUTINES_XSYMV_H_ #include "routines/level2/xgemv.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xsymv: public Xgemv { public: // Uses the generic matrix-vector routine using Xgemv::MatVec; // Constructor Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV"); // Templated-precision implementation of the routine void DoSymv(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, const T beta, const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XSYMV_H_ #endif CLBlast-1.6.3/src/routines/level2/xsyr.cpp000066400000000000000000000036371463263031500203620ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xsyr class (see the header for information about the class). // // ================================================================================================= #include "routines/level2/xsyr.hpp" #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xsyr::Xsyr(Queue &queue, EventPointer event, const std::string &name): Xher(queue, event, name) { } // ================================================================================================= // The main routine template void Xsyr::DoSyr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { // Specific Xsyr functionality is implemented in the kernel using defines DoHer(layout, triangle, n, alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld); } // ================================================================================================= // Compiles the templated class template class Xsyr; template class Xsyr; template class Xsyr; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level2/xsyr.hpp000066400000000000000000000031131463263031500203540ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xsyr routine. 
The precision is implemented using a template argument. // // ================================================================================================= #ifndef CLBLAST_ROUTINES_XSYR_H_ #define CLBLAST_ROUTINES_XSYR_H_ #include "routines/level2/xher.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xsyr: public Xher { public: // Uses the regular Xher routine using Xher::DoHer; // Constructor Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR"); // Templated-precision implementation of the routine void DoSyr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XSYR_H_ #endif CLBlast-1.6.3/src/routines/level2/xsyr2.cpp000066400000000000000000000040631463263031500204360ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xsyr2 class (see the header for information about the class). // // ================================================================================================= #include "routines/level2/xsyr2.hpp" #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xsyr2::Xsyr2(Queue &queue, EventPointer event, const std::string &name): Xher2(queue, event, name) { } // ================================================================================================= // The main routine template void Xsyr2::DoSyr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { // Specific Xsyr2 functionality is implemented in the kernel using defines DoHer2(layout, triangle, n, alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld); } // ================================================================================================= // Compiles the templated class template class Xsyr2; template class Xsyr2; template class Xsyr2; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level2/xsyr2.hpp000066400000000000000000000032541463263031500204440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // This file implements the Xsyr2 routine. The precision is implemented using a template argument. // // ================================================================================================= #ifndef CLBLAST_ROUTINES_XSYR2_H_ #define CLBLAST_ROUTINES_XSYR2_H_ #include "routines/level2/xher2.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xsyr2: public Xher2 { public: // Uses the regular Xher2 routine using Xher2::DoHer2; // Constructor Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2"); // Templated-precision implementation of the routine void DoSyr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XSYR2_H_ #endif CLBlast-1.6.3/src/routines/level2/xtbmv.cpp000066400000000000000000000067061463263031500205150ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xtbmv class (see the header for information about the class). // // ================================================================================================= #include "routines/level2/xtbmv.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xtbmv::Xtbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } // ================================================================================================= // The main routine template void Xtbmv::DoTbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer const auto x_size = (1 + (n - 1) * x_inc) + x_offset; auto scratch_buffer = Buffer(context_, x_size); x_buffer.CopyTo(queue_, x_size, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || (triangle == Triangle::kLower && layout == Layout::kRowMajor)); // Adds '2' to the parameter if the diagonal is unit auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TBMV define. 
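  // The 'parameter' value computed above packs two flags into a single integer for the kernel:
  // bit 0 selects the triangle after layout correction (0 = lower, 1 = upper), and adding 2 marks
  // a unit diagonal, giving: 0 = lower/non-unit, 1 = upper/non-unit, 2 = lower/unit, 3 = upper/unit.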
  auto fast_kernels = false;
  try {
    MatVec(layout, a_transpose,
           n, n, ConstantOne<T>(),
           a_buffer, a_offset, a_ld,
           scratch_buffer, x_offset, x_inc, ConstantZero<T>(),
           x_buffer, x_offset, x_inc,
           fast_kernels, fast_kernels,
           parameter, false, k, 0);
  } catch (BLASError &e) {
    // Returns the proper error code (renames vector Y to X)
    switch (e.status()) {
      case StatusCode::kInvalidVectorY:      throw BLASError(StatusCode::kInvalidVectorX, e.details());
      case StatusCode::kInvalidIncrementY:   throw BLASError(StatusCode::kInvalidIncrementX, e.details());
      case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
      default:                               throw;
    }
  }
}

// =================================================================================================

// Compiles the templated class
template class Xtbmv<half>;
template class Xtbmv<float>;
template class Xtbmv<double>;
template class Xtbmv<float2>;
template class Xtbmv<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level2/xtbmv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
// "MatVec" function directly.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XTBMV_H_
#define CLBLAST_ROUTINES_XTBMV_H_

#include "routines/level2/xgemv.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xtbmv: public Xgemv<T> {
 public:

  // Uses the generic matrix-vector routine
  using Xgemv<T>::queue_;
  using Xgemv<T>::context_;
  using Xgemv<T>::MatVec;

  // Constructor
  Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");

  // Templated-precision implementation of the routine
  void DoTbmv(const Layout layout, const Triangle triangle,
              const Transpose a_transpose, const Diagonal diagonal,
              const size_t n, const size_t k,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XTBMV_H_
#endif

CLBlast-1.6.3/src/routines/level2/xtpmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtpmv class (see the header for information about the class).
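// For reference, TPMV reads the standard BLAS packed layout: with zero-based indices in
// column-major order, upper-triangle element (i,j) with i <= j is stored at AP[i + j*(j+1)/2],
// and lower-triangle element (i,j) with i >= j at AP[i + j*(2*n-j-1)/2]. The DoTpmv implementation
// below passes the packed buffer to MatVec with a leading dimension of n and the packed flag set,
// leaving the index arithmetic to the kernel.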
// // ================================================================================================= #include "routines/level2/xtpmv.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xtpmv::Xtpmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } // ================================================================================================= // The main routine template void Xtpmv::DoTpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const Buffer &ap_buffer, const size_t ap_offset, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer const auto x_size = (1 + (n - 1) * x_inc) + x_offset; auto scratch_buffer = Buffer(context_, x_size); x_buffer.CopyTo(queue_, x_size, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || (triangle == Triangle::kLower && layout == Layout::kRowMajor)); // Adds '2' to the parameter if the diagonal is unit auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TPMV define. auto fast_kernels = false; try { MatVec(layout, a_transpose, n, n, ConstantOne(), ap_buffer, ap_offset, n, scratch_buffer, x_offset, x_inc, ConstantZero(), x_buffer, x_offset, x_inc, fast_kernels, fast_kernels, parameter, true, 0, 0); } catch (BLASError &e) { // Returns the proper error code (renames vector Y to X) switch (e.status()) { case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details()); case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details()); case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details()); default: throw; } } } // ================================================================================================= // Compiles the templated class template class Xtpmv; template class Xtpmv; template class Xtpmv; template class Xtpmv; template class Xtpmv; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level2/xtpmv.hpp000066400000000000000000000034561463263031500205370ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication // routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the // "MatVec" function directly. 
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XTPMV_H_ #define CLBLAST_ROUTINES_XTPMV_H_ #include "routines/level2/xgemv.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xtpmv: public Xgemv { public: // Uses the generic matrix-vector routine using Xgemv::queue_; using Xgemv::context_; using Xgemv::MatVec; // Constructor Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV"); // Templated-precision implementation of the routine void DoTpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const Buffer &ap_buffer, const size_t ap_offset, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XTPMV_H_ #endif CLBlast-1.6.3/src/routines/level2/xtrmv.cpp000066400000000000000000000066571463263031500205420ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xtrmv class (see the header for information about the class). // // ================================================================================================= #include "routines/level2/xtrmv.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xtrmv::Xtrmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } // ================================================================================================= // The main routine template void Xtrmv::DoTrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer const auto x_size = (1 + (n - 1) * x_inc) + x_offset; auto scratch_buffer = Buffer(context_, x_size); x_buffer.CopyTo(queue_, x_size, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || (triangle == Triangle::kLower && layout == Layout::kRowMajor)); // Adds '2' to the parameter if the diagonal is unit auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. // The specific triangular matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TRMV define. 
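  // Since alpha is fixed to one and beta to zero, the MatVec call below effectively computes
  // x := A * x_scratch: the scratch copy acts as the read-only input so that writing the result
  // into x does not race with reads of the original vector.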
auto fast_kernels = false; try { MatVec(layout, a_transpose, n, n, ConstantOne(), a_buffer, a_offset, a_ld, scratch_buffer, x_offset, x_inc, ConstantZero(), x_buffer, x_offset, x_inc, fast_kernels, fast_kernels, parameter, false, 0, 0); } catch (BLASError &e) { // Returns the proper error code (renames vector Y to X) switch (e.status()) { case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details()); case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details()); case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details()); default: throw; } } } // ================================================================================================= // Compiles the templated class template class Xtrmv; template class Xtrmv; template class Xtrmv; template class Xtrmv; template class Xtrmv; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level2/xtrmv.hpp000066400000000000000000000034771463263031500205440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication // routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the // "MatVec" function directly. // // ================================================================================================= #ifndef CLBLAST_ROUTINES_XTRMV_H_ #define CLBLAST_ROUTINES_XTRMV_H_ #include "routines/level2/xgemv.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xtrmv: public Xgemv { public: // Uses the generic matrix-vector routine using Xgemv::queue_; using Xgemv::context_; using Xgemv::MatVec; // Constructor Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV"); // Templated-precision implementation of the routine void DoTrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XTRMV_H_ #endif CLBlast-1.6.3/src/routines/level2/xtrsv.cpp000066400000000000000000000171611463263031500205400ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xtrsv class (see the header for information about the class). 
//
// =================================================================================================

#include "routines/level2/xtrsv.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
    Xgemv<T>(queue, event, name) {
}

// =================================================================================================

template <typename T>
void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
                            const Transpose a_transpose, const Diagonal diagonal,
                            const size_t n,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                            const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            EventPointer event) {

  if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); };

  // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
  const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
  const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
                              (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
  const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0;

  // The data is either in the upper or lower triangle
  const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
                         (triangle == Triangle::kLower && a_transpose != Transpose::kNo));

  // Retrieves the kernel from the compiled binary
  const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments
  kernel.SetArgument(0, static_cast<int>(n));
  kernel.SetArgument(1, a_buffer());
  kernel.SetArgument(2, static_cast<int>(a_offset));
  kernel.SetArgument(3, static_cast<int>(a_ld));
  kernel.SetArgument(4, b_buffer());
  kernel.SetArgument(5, static_cast<int>(b_offset));
  kernel.SetArgument(6, static_cast<int>(b_inc));
  kernel.SetArgument(7, x_buffer());
  kernel.SetArgument(8, static_cast<int>(x_offset));
  kernel.SetArgument(9, static_cast<int>(x_inc));
  kernel.SetArgument(10, static_cast<int>(is_transposed));
  kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
  kernel.SetArgument(12, static_cast<int>(do_conjugate));

  // Launches the kernel
  const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
  const auto global = std::vector<size_t>{Ceil(n, db_["TRSV_BLOCK_SIZE"])};
  RunKernel(kernel, queue_, device_, global, local, event);
}

// =================================================================================================

// The main routine
template <typename T>
void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
                      const Transpose a_transpose, const Diagonal diagonal,
                      const size_t n,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
  if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
    throw RuntimeErrorCode(StatusCode::kNotImplemented);
  }

  // Tests the matrix and vector
  TestMatrixA(n, n, a_buffer, a_offset, a_ld);
  TestVectorX(n, b_buffer, b_offset, b_inc);

  // Creates a copy of B to avoid overwriting input while computing output
  // TODO: Make x with 0 offset and unit increment by
  //       creating custom copy-to and copy-from kernels
  const auto x_offset = b_offset;
  const auto x_inc = b_inc;
  const auto x_size = (1 + (n - 1) * x_inc) + x_offset;
  auto x_buffer = Buffer<T>(context_, x_size);
  b_buffer.CopyTo(queue_, x_size, x_buffer);

  // Fills the output buffer with zeros
  auto eventWaitList = std::vector<Event>();
  auto fill_vector_event = Event();
  FillVector(queue_, device_, program_, fill_vector_event.pointer(), eventWaitList,
             n, x_inc, x_offset, x_buffer, ConstantZero<T>(), 16);
  fill_vector_event.WaitForCompletion();

  // Derives properties based on the arguments
  const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
                         (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
  const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
                              (layout != Layout::kColMajor && a_transpose != Transpose::kNo));

  // Loops over the blocks
  auto col = n; // the initial column position
  for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) {
    const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i);

    // Sets the next column position
    col = (is_upper) ? col - block_size : i;

    // Sets the offsets for upper or lower triangular
    const auto extra_offset_a = (is_transposed) ?
                                (is_upper ? col + (col+block_size)*a_ld : col) :
                                (is_upper ? col+block_size + col*a_ld : col*a_ld);
    const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0;
    const auto extra_offset_b = col*x_inc;

    // Runs the GEMV routine to compute x' = A * x
    if (i > 0) {
      const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
      const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
      auto gemv_event = Event();
      auto gemv = Xgemv<T>(queue_, gemv_event.pointer());
      gemv.DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
                  a_buffer, a_offset + extra_offset_a, a_ld,
                  x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
                  x_buffer, x_offset + extra_offset_b, x_inc);
      gemv_event.WaitForCompletion();
    }

    // Runs the triangular substitution for the block size
    auto sub_event = Event();
    Substitution(layout, triangle, a_transpose, diagonal, block_size,
                 a_buffer, a_offset + col + col*a_ld, a_ld,
                 b_buffer, b_offset + col*b_inc, b_inc,
                 x_buffer, x_offset + col*x_inc, x_inc,
                 sub_event.pointer());
    sub_event.WaitForCompletion();
  }

  // Retrieves the results
  x_buffer.CopyToAsync(queue_, x_size, b_buffer, event_);
}

// =================================================================================================

// Compiles the templated class
template class Xtrsv<half>;
template class Xtrsv<float>;
template class Xtrsv<double>;
template class Xtrsv<float2>;
template class Xtrsv<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/level2/xtrsv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular
// forward and backward substitutions on the diagonal parts of the matrix in combination with larger
// GEMV computation on the remainder of the matrix.
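A condensed host-side sketch of using this routine, under the same assumptions as the SPR2 example
earlier (CLBlast C API, valid queue, buffers a and x already created and filled; names
illustrative). This solves the lower-triangular system A*x = b in place, with b arriving in and x
returned through the same buffer:

cl_event event = NULL;
CLBlastStatusCode status = CLBlastStrsv(CLBlastLayoutColMajor, CLBlastTriangleLower,
                                        CLBlastTransposeNo, CLBlastDiagonalNonUnit,
                                        n,
                                        a, 0, n,   /* A: buffer, offset, leading dimension */
                                        x, 0, 1,   /* b on input, x on output */
                                        &queue, &event);
if (status == CLBlastSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }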
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XTRSV_H_ #define CLBLAST_ROUTINES_XTRSV_H_ #include "routines/level2/xgemv.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xtrsv: public Xgemv { public: // Uses the generic matrix-vector routine using Xgemv::queue_; using Xgemv::context_; using Xgemv::device_; using Xgemv::db_; using Xgemv::program_; using Xgemv::event_; using Xgemv::DoGemv; // Constructor Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV"); // Templated-precision implementation of the routine void DoTrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); // Performs forward or backward substitution on a small triangular matrix void Substitution(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_inc, const Buffer &x_buffer, const size_t offset_x, const size_t x_inc, EventPointer event); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XTRSV_H_ #endif CLBlast-1.6.3/src/routines/level3/000077500000000000000000000000001463263031500166415ustar00rootroot00000000000000CLBlast-1.6.3/src/routines/level3/xgemm.cpp000066400000000000000000000364151463263031500204730ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xgemm class (see the header for information about the class). 
// // ================================================================================================= #include "routines/level3/xgemm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"}, PrecisionValue(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" #include "../../kernels/level3/transpose_fast.opencl" #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/convert_symmetric.opencl" #include "../../kernels/level3/convert_triangular.opencl" #include "../../kernels/level3/convert_hermitian.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include "../../kernels/level3/xgemm_direct_part3.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { } // ================================================================================================= // The main routine template void Xgemm::DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const Buffer &temp_buffer, const bool temp_buffer_provided) { // optional arguments // Two methods to choose from, select which one to run const auto do_gemm_direct = UseDirectKernel(m, n, k, db_["XGEMM_MIN_INDIRECT_SIZE"]); const auto gemm_kernel_id = (do_gemm_direct) ? 0 : db_["GEMMK"]; // Computes the transpose/conjugate options and sets the a/b/c sizes based on that bool a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate; size_t a_one, a_two, b_one, b_two, c_one, c_two; ProcessArguments(layout, a_transpose, b_transpose, m, n, k, a_one, a_two, b_one, b_two, c_one, c_two, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, gemm_kernel_id); // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage // space. 
Also tests that the leading dimensions of: // matrix A cannot be less than K when rotated, or less than M when not-rotated // matrix B cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N when rotated, or less than M when not-rotated TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); // Selects which version of GEMM to run if (do_gemm_direct) { // for small sizes (single kernel) GemmDirect(m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); } else { // for larger sizes (pre/post-processing plus a very fast kernel) GemmIndirect(m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, a_one, a_two, b_one, b_two, c_one, c_two, temp_buffer, temp_buffer_provided); } } // ================================================================================================= // The indirect version of GEMM. This uses the faster but non-general kernel. It has specific // requirements, but several pre and post-processing kernels take care of those. However, the // overhead of these extra kernels might not be ideal for certain devices/arguments. template void Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t a_one, const size_t a_two, const size_t b_one, const size_t b_two, const size_t c_one, const size_t c_two, const Buffer &temp_buffer, const bool temp_buffer_provided) { // Calculates the ceiled versions of m, n, and k const auto global_divider_one = c_want_rotated_(db_["GEMMK"]) ? db_["NWG"] : db_["MWG"]; const auto global_divider_two = c_want_rotated_(db_["GEMMK"]) ? db_["MWG"] : db_["NWG"]; const auto m_ceiled = Ceil(m, global_divider_one); const auto n_ceiled = Ceil(n, global_divider_two); const auto k_ceiled = Ceil(k, db_["KWG"] * db_["KREG"]); // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account // whether the matrices need to be rotated or not for the kernel. 
size_t a_one_i, a_two_i, b_one_i, b_two_i, c_one_i, c_two_i; CalculateInternalDimensions(m, n, k, db_["MWG"], db_["NWG"], db_["KWG"] * db_["KREG"], a_one_i, a_two_i, b_one_i, b_two_i, c_one_i, c_two_i, db_["GEMMK"]); // Determines whether or not temporary matrices are needed auto a_no_temp = NoTempBuffer(a_one, a_one_i, a_two, a_two_i, a_ld, a_offset, a_do_transpose, a_conjugate); auto b_no_temp = NoTempBuffer(b_one, b_one_i, b_two, b_two_i, b_ld, b_offset, b_do_transpose, b_conjugate); auto c_no_temp = NoTempBuffer(c_one, c_one_i, c_two, c_two_i, c_ld, c_offset, c_do_transpose, false); // Computes the sizes and offsets for (optional) temporary buffers for the 3 matrices auto b_temp_offset = size_t{0}; auto c_temp_offset = size_t{0}; const auto temp_size = ComputeTempSize(a_no_temp, b_no_temp, c_no_temp, a_one_i*a_two_i, b_one_i*b_two_i, c_one_i*c_two_i, b_temp_offset, c_temp_offset); if (!IsMultiple(b_temp_offset, db_["VWN"])) { throw BLASError(StatusCode::kUnexpectedError); } if (!IsMultiple(c_temp_offset, db_["VWM"])) { throw BLASError(StatusCode::kUnexpectedError); } // Creates the buffer for the (optional) temporary matrices. Note that we use 'a_buffer' in case // when no temporary buffer is needed, but that's just to make it compile: it is never used. const auto temp_buffer_all = (temp_buffer_provided) ? temp_buffer : ((temp_size > 0) ? Buffer(context_, temp_size) : a_buffer); // Verifies if the provided temporary buffer is large enough if (temp_buffer_provided) { const auto required_size = temp_size * sizeof(T); if (temp_buffer_all.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryTemp); } } // Sets the buffer pointers for (temp) matrices A, B, and C const auto a_temp = (a_no_temp) ? a_buffer : temp_buffer_all; const auto b_temp = (b_no_temp) ? b_buffer : temp_buffer_all; const auto c_temp = (c_no_temp) ? c_buffer : temp_buffer_all; // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_temp, ConstantOne(), program_, true, a_do_transpose, a_conjugate); eventWaitList.push_back(eventProcessA); } // As above, but now for matrix B if (!b_no_temp) { auto eventProcessB = Event(); PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, b_one_i, b_two_i, b_one_i, b_temp_offset, b_temp, ConstantOne(), program_, true, b_do_transpose, b_conjugate); eventWaitList.push_back(eventProcessB); } // As above, but now for matrix C. This is only necessary if C is used both as input and output. 
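  // When beta is zero the original contents of C are never read, so the pre-processing copy of C
  // can be skipped even when a temporary C buffer is in use; the guard below checks for that.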
if (!c_no_temp && beta != static_cast(0)) { auto eventProcessC = Event(); PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, c_one_i, c_two_i, c_one_i, c_temp_offset, c_temp, ConstantOne(), program_, true, c_do_transpose, false); eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary auto kernel = Kernel(program_, "Xgemm"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m_ceiled)); kernel.SetArgument(1, static_cast(n_ceiled)); kernel.SetArgument(2, static_cast(k_ceiled)); kernel.SetArgument(3, GetRealArg(alpha)); kernel.SetArgument(4, GetRealArg(beta)); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, b_temp()); kernel.SetArgument(7, c_temp()); kernel.SetArgument(8, static_cast(b_temp_offset / db_["VWN"])); kernel.SetArgument(9, static_cast(c_temp_offset / db_["VWM"])); // Computes the global and local thread sizes const auto global = std::vector{ (c_one_i * db_["MDIMC"]) / db_["MWG"], (c_two_i * db_["NDIMC"]) / db_["NWG"] }; const auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel auto eventKernel = Event(); auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); // Runs the post-processing kernel if needed if (!c_no_temp) { eventWaitList.push_back(eventKernel); PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, c_one_i, c_two_i, c_one_i, c_temp_offset, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, ConstantOne(), program_, false, c_do_transpose, false); } } // ================================================================================================= // The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels. template void Xgemm::GemmDirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate) { // Retrieves the proper XgemmDirect kernel from the compiled binary const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : (b_do_transpose ? 
"XgemmDirectNT" : "XgemmDirectNN"); auto kernel = Kernel(program_, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m)); kernel.SetArgument(1, static_cast(n)); kernel.SetArgument(2, static_cast(k)); kernel.SetArgument(3, GetRealArg(alpha)); kernel.SetArgument(4, GetRealArg(beta)); kernel.SetArgument(5, a_buffer()); kernel.SetArgument(6, static_cast(a_offset)); kernel.SetArgument(7, static_cast(a_ld)); kernel.SetArgument(8, b_buffer()); kernel.SetArgument(9, static_cast(b_offset)); kernel.SetArgument(10, static_cast(b_ld)); kernel.SetArgument(11, c_buffer()); kernel.SetArgument(12, static_cast(c_offset)); kernel.SetArgument(13, static_cast(c_ld)); kernel.SetArgument(14, static_cast(c_do_transpose)); kernel.SetArgument(15, static_cast(a_conjugate)); kernel.SetArgument(16, static_cast(b_conjugate)); // Computes the global and local thread sizes const auto m_ceiled = Ceil(m, db_["WGD"]); const auto n_ceiled = Ceil(n, db_["WGD"]); const auto global = std::vector{ // CeilDiv(m * db_["MDIMCD"], db_["WGD"]), // CeilDiv(n * db_["NDIMCD"], db_["WGD"]) (m_ceiled * db_["MDIMCD"]) / db_["WGD"], (n_ceiled * db_["NDIMCD"]) / db_["WGD"] }; const auto local = std::vector{db_["MDIMCD"], db_["NDIMCD"]}; // Launches the kernel RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= // Compiles the templated class template class Xgemm; template class Xgemm; template class Xgemm; template class Xgemm; template class Xgemm; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level3/xgemm.hpp000066400000000000000000000260361463263031500204760ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xgemm routine. The precision is implemented using a template argument. 
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XGEMM_H_ #define CLBLAST_ROUTINES_XGEMM_H_ #include "routine.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xgemm: public Routine { public: // Defines the assumptions of the GEMM kernels static bool a_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; } static bool b_want_rotated_(const size_t) { return true; } static bool c_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; } // Computes the size of the temporary GEMM buffer based on user-arguments static size_t GetTempSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const size_t a_offset, const size_t a_ld, const size_t b_offset, const size_t b_ld, const size_t c_offset, const size_t c_ld, const size_t mwg, const size_t nwg, const size_t kwg, const size_t gemm_kernel_id) { // Computes the transpose/conjugate options and sets the a/b/c sizes based on that bool a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate; size_t a_one, a_two, b_one, b_two, c_one, c_two; ProcessArguments(layout, a_transpose, b_transpose, m, n, k, a_one, a_two, b_one, b_two, c_one, c_two, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, gemm_kernel_id); // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account // whether the matrices need to be rotated or not for the kernel. size_t a_one_i, a_two_i, b_one_i, b_two_i, c_one_i, c_two_i; CalculateInternalDimensions(m, n, k, mwg, nwg, kwg, a_one_i, a_two_i, b_one_i, b_two_i, c_one_i, c_two_i, gemm_kernel_id); // Determines whether or not temporary matrices are needed auto a_no_temp = NoTempBuffer(a_one, a_one_i, a_two, a_two_i, a_ld, a_offset, a_do_transpose, a_conjugate); auto b_no_temp = NoTempBuffer(b_one, b_one_i, b_two, b_two_i, b_ld, b_offset, b_do_transpose, b_conjugate); auto c_no_temp = NoTempBuffer(c_one, c_one_i, c_two, c_two_i, c_ld, c_offset, c_do_transpose, false); // Computes the sizes and offsets for (optional) temporary buffers for the 3 matrices auto b_temp_offset = size_t{0}; auto c_temp_offset = size_t{0}; return ComputeTempSize(a_no_temp, b_no_temp, c_no_temp, a_one_i*a_two_i, b_one_i*b_two_i, c_one_i*c_two_i, b_temp_offset, c_temp_offset); } // Selects which version of GEMM to run static bool UseDirectKernel(const size_t m, const size_t n, const size_t k, const size_t min_indirect_size) { const auto m_n_k = static_cast(m) * static_cast(n) * static_cast(k); const auto min_indirect_size_ll = static_cast(min_indirect_size); const auto min_indirect_size_e3 = min_indirect_size_ll * min_indirect_size_ll * min_indirect_size_ll; return (m_n_k < min_indirect_size_e3); } // Process the user-arguments, computes secondary parameters static void ProcessArguments(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, size_t& a_one, size_t& a_two, size_t& b_one, size_t& b_two, size_t& c_one, size_t& c_two, bool& a_do_transpose, bool& b_do_transpose, bool& c_do_transpose, bool& a_conjugate, bool& b_conjugate, const size_t gemm_kernel_id) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) || (k == 0)) { throw 
BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrices are transposed in memory. This is based on their layout // (row or column-major) and whether or not they are requested to be pre-transposed. Note // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of // col-major) to be transformed, so transposing requirements are not the same as whether or not // the matrix is actually transposed in memory. const auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); const auto c_rotated = (layout == Layout::kRowMajor); a_do_transpose = a_rotated != a_want_rotated_(gemm_kernel_id); b_do_transpose = b_rotated != b_want_rotated_(gemm_kernel_id); c_do_transpose = c_rotated != c_want_rotated_(gemm_kernel_id); // In case of complex data-types, the transpose can also become a conjugate transpose a_conjugate = (a_transpose == Transpose::kConjugate); b_conjugate = (b_transpose == Transpose::kConjugate); // Computes the first and second dimensions of the 3 matrices taking into account whether the // matrices are rotated or not a_one = (a_rotated) ? k : m; a_two = (a_rotated) ? m : k; b_one = (b_rotated) ? n : k; b_two = (b_rotated) ? k : n; c_one = (c_rotated) ? n : m; c_two = (c_rotated) ? m : n; } // Computes the sizes and offsets for (optional) temporary buffers for the 3 matrices static size_t ComputeTempSize(const bool a_no_temp, const bool b_no_temp, const bool c_no_temp, const size_t a_size, const size_t b_size, const size_t c_size, size_t &b_temp_offset, size_t &c_temp_offset) { auto temp_size = size_t{0}; if (!a_no_temp) { temp_size += a_size; } if (!b_no_temp) { b_temp_offset = temp_size; temp_size += b_size; } if (!c_no_temp) { c_temp_offset = temp_size; temp_size += c_size; } return temp_size; } // Determines whether or not temporary matrices are needed static bool NoTempBuffer(const size_t one, const size_t one_i, const size_t two, const size_t two_i, const size_t ld, const size_t offset, const bool do_transpose, const bool conjugate) { return one == one_i && two == two_i && ld == one && offset == 0 && !do_transpose && !conjugate; } // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account // whether the matrices need to be rotated or not for the kernel. static void CalculateInternalDimensions(const size_t m, const size_t n, const size_t k, const size_t mwg, const size_t nwg, const size_t kwg, size_t& a_one_i, size_t& a_two_i, size_t& b_one_i, size_t& b_two_i, size_t& c_one_i, size_t& c_two_i, const size_t gemm_kernel_id) { const auto global_divider_one = c_want_rotated_(gemm_kernel_id) ? nwg : mwg; const auto global_divider_two = c_want_rotated_(gemm_kernel_id) ? mwg : nwg; const auto m_ceiled = Ceil(m, global_divider_one); const auto n_ceiled = Ceil(n, global_divider_two); const auto k_ceiled = Ceil(k, kwg); a_one_i = (a_want_rotated_(gemm_kernel_id)) ? k_ceiled : m_ceiled; a_two_i = (a_want_rotated_(gemm_kernel_id)) ? m_ceiled : k_ceiled; b_one_i = (b_want_rotated_(gemm_kernel_id)) ? n_ceiled : k_ceiled; b_two_i = (b_want_rotated_(gemm_kernel_id)) ? k_ceiled : n_ceiled; c_one_i = (c_want_rotated_(gemm_kernel_id)) ? n_ceiled : m_ceiled; c_two_i = (c_want_rotated_(gemm_kernel_id)) ? 
m_ceiled : n_ceiled; } // Constructor Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM"); // Templated-precision implementation of the routine void DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const Buffer &temp_buffer = Buffer(0), const bool temp_buffer_provided = false); // Indirect version of GEMM (with pre and post-processing kernels) void GemmIndirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t a_one, const size_t a_two, const size_t b_one, const size_t b_two, const size_t c_one, const size_t c_two, const Buffer &temp_buffer, const bool temp_buffer_provided); // Direct version of GEMM (no pre and post-processing kernels) void GemmDirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XGEMM_H_ #endif CLBlast-1.6.3/src/routines/level3/xhemm.cpp000066400000000000000000000127751463263031500204770ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xhemm class (see the header for information about the class). 
// // ================================================================================================= #include "routines/level3/xhemm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xhemm::Xhemm(Queue &queue, EventPointer event, const std::string &name): Xgemm(queue, event, name) { } // ================================================================================================= // The main routine template void Xhemm::DoHemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the // left) or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the squared A matrix TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || (triangle == Triangle::kLower && layout == Layout::kRowMajor)); auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared"; // Temporary buffer for a copy of the hermitian matrix auto temp_herm = Buffer(context_, k*k); // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the hermitian-to-squared kernel kernel.SetArgument(0, static_cast(k)); kernel.SetArgument(1, static_cast(a_ld)); kernel.SetArgument(2, static_cast(a_offset)); kernel.SetArgument(3, a_buffer()); kernel.SetArgument(4, static_cast(k)); kernel.SetArgument(5, static_cast(k)); kernel.SetArgument(6, static_cast(0)); kernel.SetArgument(7, temp_herm()); // Uses the common padding kernel's thread configuration. This is allowed, since the // hermitian-to-squared kernel uses the same parameters. auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); // Synchronize now: 'DoGemm' does not accept a list of events to wait for kernelEvent.WaitForCompletion(); // Runs the regular Xgemm code with either "C := AB+C" or ... if (side == Side::kLeft) { DoGemm(layout, Transpose::kNo, Transpose::kNo, m, n, k, alpha, temp_herm, 0, k, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld); } // ... with "C := BA+C". Note that A and B are now reversed. 
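    // Because the hermitian operand sits on the right-hand side here, the squared copy of A is
    // passed to Xgemm in the B slot; any A/B-specific error therefore refers to the caller's
    // operands in reverse, which the catch block below maps back.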
else { try { DoGemm(layout, Transpose::kNo, Transpose::kNo, m, n, k, alpha, b_buffer, b_offset, b_ld, temp_herm, 0, k, beta, c_buffer, c_offset, c_ld); } catch (BLASError &e) { // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine switch(e.status()) { case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details()); case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); default: throw; } } } } // ================================================================================================= // Compiles the templated class template class Xhemm; template class Xhemm; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level3/xhemm.hpp000066400000000000000000000037541463263031500205010ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xhemm routine. It is based on the generalized matrix multiplication // routine (Xgemm). The implementation is very similar to the Xsymm routine. // // ================================================================================================= #ifndef CLBLAST_ROUTINES_XHEMM_H_ #define CLBLAST_ROUTINES_XHEMM_H_ #include "routines/level3/xgemm.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xhemm: public Xgemm { public: // Uses methods and variables the regular Xgemm routine using Xgemm::routine_name_; using Xgemm::queue_; using Xgemm::context_; using Xgemm::device_; using Xgemm::program_; using Xgemm::db_; using Xgemm::DoGemm; // Constructor Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM"); // Templated-precision implementation of the routine void DoHemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XHEMM_H_ #endif CLBlast-1.6.3/src/routines/level3/xher2k.cpp000066400000000000000000000055771463263031500205660ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the Xher2k class (see the header for information about the class).
//
// =================================================================================================

#include "routines/level3/xher2k.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
    Xherk<T,U>(queue, event, name) {
}

// =================================================================================================

// The main routine
template <typename T, typename U>
void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                          const size_t n, const size_t k,
                          const T alpha,
                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                          const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                          const U beta,
                          const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Runs the first matrix multiplication
  auto first_herk_event = Event();
  auto complex_beta = T{beta, static_cast<U>(0.0)};
  const auto negated_ab_transpose = (ab_transpose != Transpose::kNo) ? Transpose::kNo : Transpose::kYes;
  HerkAB(layout, triangle, ab_transpose, negated_ab_transpose, n, k, alpha,
         a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, complex_beta,
         c_buffer, c_offset, c_ld, first_herk_event.pointer(), false);
  first_herk_event.WaitForCompletion();

  // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugates alpha
  auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
  auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
  HerkAB(layout, triangle, ab_transpose, negated_ab_transpose, n, k, conjugate_alpha,
         b_buffer, b_offset, b_ld, a_buffer, a_offset, a_ld, complex_one,
         c_buffer, c_offset, c_ld, event_, true);
}

// =================================================================================================

// Compiles the templated class
template class Xher2k<float2,float>;
template class Xher2k<double2,double>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level3/xher2k.hpp
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the Xher2k routine. The precision is implemented using the template argument
// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
// Xsyr2k routine.
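//
// [Editor's note] The decomposition used by DoHer2k, written out: HER2K computes
//   C := alpha * A * B^H + conj(alpha) * B * A^H + beta * C,
// which the routine realizes as two HerkAB calls: first C := alpha * A * B^H + beta * C, then
// C := conj(alpha) * B * A^H + 1 * C with the A/B arguments swapped. Only the second call passes
// 'true' as the final flag, which (as far as this editor can tell from the Xherk post-processing)
// zeroes the imaginary parts of the diagonal of C, as required for a hermitian result.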
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XHER2K_H_ #define CLBLAST_ROUTINES_XHER2K_H_ #include "routine.hpp" #include "routines/level3/xherk.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xher2k: public Xherk { public: // Uses methods and variables the regular Xherk routine using Xherk::event_; using Xherk::HerkAB; // Constructor Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K"); // Templated-precision implementation of the routine void DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const U beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XHER2K_H_ #endif CLBlast-1.6.3/src/routines/level3/xherk.cpp000066400000000000000000000233501463263031500204710ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xherk class (see the header for information about the class). // // ================================================================================================= #include "routines/level3/xherk.hpp" #include "routines/level3/xgemm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xherk::Xherk(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" #include "../../kernels/level3/transpose_fast.opencl" #include "../../kernels/level3/transpose_pad.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { } // ================================================================================================= // The main routine template void Xherk::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const U alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const U beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { const auto b_transpose = (a_transpose != Transpose::kNo) ? 
Transpose::kNo : Transpose::kYes; const auto b_buffer = a_buffer; const auto b_offset = a_offset; const auto b_ld = a_ld; const auto complex_alpha = T{alpha, static_cast(0.0)}; const auto complex_beta = T{beta, static_cast(0.0)}; HerkAB(layout, triangle, a_transpose, b_transpose, n, k, complex_alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, complex_beta, c_buffer, c_offset, c_ld, event_, true); } template void Xherk::HerkAB(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Transpose b_transpose, const size_t n, const size_t k, const T complex_alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T complex_beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, EventPointer final_event, const bool diagonal_to_zero) { // Computes the transpose/conjugate options and sets the a/b/c sizes based on that bool a_do_transpose, b_do_transpose, c_do_transpose, dummy1, dummy2; size_t a_one, a_two, b_one, b_two, c_one, c_two; Xgemm::ProcessArguments(layout, a_transpose, b_transpose, n, n, k, a_one, a_two, b_one, b_two, c_one, c_two, a_do_transpose, b_do_transpose, c_do_transpose, dummy1, dummy2, db_["GEMMK"]); // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or // to matrix A (argument: conjugate transpose) auto a_conjugate = (a_transpose != Transpose::kNo); auto b_conjugate = (b_transpose != Transpose::kNo); // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage // space. Also tests that the leading dimensions of: // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"] * db_["KREG"]); // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account // whether the matrices need to be rotated or not for the kernel. const auto a_one_i = (Xgemm::a_want_rotated_(db_["GEMMK"])) ? k_ceiled : n_ceiled; const auto a_two_i = (Xgemm::a_want_rotated_(db_["GEMMK"])) ? n_ceiled : k_ceiled; const auto b_one_i = (!Xgemm::b_want_rotated_(db_["GEMMK"])) ? k_ceiled : n_ceiled; const auto b_two_i = (!Xgemm::b_want_rotated_(db_["GEMMK"])) ? n_ceiled : k_ceiled; // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; // Determines whether or not temporary matrices are needed const auto a_no_temp = Xgemm::NoTempBuffer(a_one, a_one_i, a_two, a_two_i, a_ld, a_offset, a_do_transpose, a_conjugate); const auto b_no_temp = Xgemm::NoTempBuffer(b_one, b_one_i, b_two, b_two_i, b_ld, b_offset, b_do_transpose, b_conjugate); // Creates the temporary matrices auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, a_one_i * a_two_i); auto b_temp = (b_no_temp) ? 
b_buffer : Buffer(context_, b_one_i * b_two_i); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. Two copies are created. if (!a_no_temp) { auto eventProcessA = Event(); PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_temp, ConstantOne(), program_, true, a_do_transpose, a_conjugate); eventWaitList.push_back(eventProcessA); } if (!b_no_temp) { auto eventProcessB = Event(); PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, b_one_i, b_two_i, b_one_i, 0, b_temp, ConstantOne(), program_, true, b_do_transpose, b_conjugate); eventWaitList.push_back(eventProcessB); } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program_, true, c_do_transpose, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); kernel.SetArgument(2, GetRealArg(complex_alpha)); kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); // Computes the global and local thread sizes auto global = std::vector{ (n_ceiled * db_["MDIMC"]) / db_["MWG"], (n_ceiled * db_["NDIMC"]) / db_["NWG"] }; auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel auto eventKernel = Event(); RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); eventWaitList.push_back(eventKernel); // Runs the post-processing kernel const auto upper = Xgemm::c_want_rotated_(db_["GEMMK"]) ? (triangle == Triangle::kLower) : (triangle == Triangle::kUpper); const auto lower = !upper; PadCopyTransposeMatrix(queue_, device_, db_, final_event, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program_, false, c_do_transpose, false, upper, lower, diagonal_to_zero); } // ================================================================================================= // Compiles the templated class template class Xherk; template class Xherk; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level3/xherk.hpp000066400000000000000000000044561463263031500205040ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xherk routine. The precision is implemented using the template argument // 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the // Xsyrk routine. // // ================================================================================================= #ifndef CLBLAST_ROUTINES_XHERK_H_ #define CLBLAST_ROUTINES_XHERK_H_ #include "routine.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xherk: public Routine { public: // Constructor Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK"); // Templated-precision implementation of the routine void DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const U alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const U beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); // Helper function to be reused for HER2K void HerkAB(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Transpose b_transpose, const size_t n, const size_t k, const T complex_alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T complex_beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, EventPointer final_event, const bool diagonal_to_zero); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XHERK_H_ #endif CLBlast-1.6.3/src/routines/level3/xsymm.cpp000066400000000000000000000130601463263031500205220ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xsymm class (see the header for information about the class). 
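//
// [Editor's note] A worked example of the side-dependent k dimension computed in DoSymm below:
// with side = kLeft the product is C(m x n) := A(m x m) * B(m x n), so the squared matrix A has
// k = m; with side = kRight it is C(m x n) := B(m x n) * A(n x n), so k = n. The same rule is
// used by the Xhemm and Xtrmm routines in this directory.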
// // ================================================================================================= #include "routines/level3/xsymm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xsymm::Xsymm(Queue &queue, EventPointer event, const std::string &name): Xgemm(queue, event, name) { } // ================================================================================================= // The main routine template void Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the // left) or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the squared A matrix TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || (triangle == Triangle::kLower && layout == Layout::kRowMajor)); auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared"; // Temporary buffer for a copy of the symmetric matrix auto temp_symm = Buffer(context_, k*k); // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the symmetric-to-squared kernel kernel.SetArgument(0, static_cast(k)); kernel.SetArgument(1, static_cast(a_ld)); kernel.SetArgument(2, static_cast(a_offset)); kernel.SetArgument(3, a_buffer()); kernel.SetArgument(4, static_cast(k)); kernel.SetArgument(5, static_cast(k)); kernel.SetArgument(6, static_cast(0)); kernel.SetArgument(7, temp_symm()); // Uses the common padding kernel's thread configuration. This is allowed, since the // symmetric-to-squared kernel uses the same parameters. auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); // Synchronize now: 'DoGemm' does not accept a list of events to wait for kernelEvent.WaitForCompletion(); // Runs the regular Xgemm code with either "C := AB+C" or ... if (side == Side::kLeft) { DoGemm(layout, Transpose::kNo, Transpose::kNo, m, n, k, alpha, temp_symm, 0, k, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld); } // ... with "C := BA+C". Note that A and B are now reversed. 
else { try { DoGemm(layout, Transpose::kNo, Transpose::kNo, m, n, k, alpha, b_buffer, b_offset, b_ld, temp_symm, 0, k, beta, c_buffer, c_offset, c_ld); } catch (BLASError &e) { // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine switch(e.status()) { case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details()); case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); default: throw; } } } } // ================================================================================================= // Compiles the templated class template class Xsymm; template class Xsymm; template class Xsymm; template class Xsymm; template class Xsymm; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level3/xsymm.hpp000066400000000000000000000042641463263031500205350ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xsymm routine. It is based on the generalized matrix multiplication // routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the // "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by // transforming it into a general matrix, and then calls the regular GEMM code. 
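//
// [Editor's note] For reference, a hedged sketch of how this routine is reached through the
// public C++ API declared in <clblast.h> (the exact signature is assumed here from the clblast.h
// conventions; buffer setup and error handling are omitted). It computes C := alpha*A*B + beta*C
// with a symmetric A (lower triangle stored) on the left:
//
//   #include <clblast.h>
//   void RunSsymm(cl_command_queue queue, cl_mem a, cl_mem b, cl_mem c,
//                 const size_t m, const size_t n) {
//     const auto status = clblast::Symm(clblast::Layout::kColMajor, clblast::Side::kLeft,
//                                       clblast::Triangle::kLower, m, n, 1.0f,
//                                       a, 0, m,    // A is m-by-m with ld = m
//                                       b, 0, m,    // B is m-by-n with ld = m
//                                       0.0f,
//                                       c, 0, m,    // C is m-by-n with ld = m
//                                       &queue, nullptr);
//     if (status != clblast::StatusCode::kSuccess) { /* handle the error */ }
//   }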
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XSYMM_H_ #define CLBLAST_ROUTINES_XSYMM_H_ #include "routines/level3/xgemm.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xsymm: public Xgemm { public: // Uses methods and variables the regular Xgemm routine using Xgemm::routine_name_; using Xgemm::queue_; using Xgemm::context_; using Xgemm::device_; using Xgemm::program_; using Xgemm::db_; using Xgemm::DoGemm; // Constructor Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM"); // Templated-precision implementation of the routine void DoSymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XSYMM_H_ #endif CLBlast-1.6.3/src/routines/level3/xsyr2k.cpp000066400000000000000000000053771463263031500206230ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xsyr2k class (see the header for information about the class). // // ================================================================================================= #include "routines/level3/xsyr2k.hpp" #include "routines/level3/xgemm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xsyr2k::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): Xsyrk(queue, event, name) { } // ================================================================================================= // The main routine template void Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Runs the first matrix multiplication auto first_syrk_event = Event(); const auto negated_ab_transpose = (ab_transpose != Transpose::kNo) ? 
Transpose::kNo : Transpose::kYes;
  SyrkAB(layout, triangle, ab_transpose, negated_ab_transpose, n, k, alpha,
         a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
         c_buffer, c_offset, c_ld, first_syrk_event.pointer());
  first_syrk_event.WaitForCompletion();

  // Swaps the arguments for matrices A and B, and sets 'beta' to 1
  auto one = ConstantOne<T>();
  SyrkAB(layout, triangle, ab_transpose, negated_ab_transpose, n, k, alpha,
         b_buffer, b_offset, b_ld, a_buffer, a_offset, a_ld, one,
         c_buffer, c_offset, c_ld, event_);
}

// =================================================================================================

// Compiles the templated class
template class Xsyr2k<half>;
template class Xsyr2k<float>;
template class Xsyr2k<double>;
template class Xsyr2k<float2>;
template class Xsyr2k<double2>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level3/xsyr2k.hpp
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
// The implementation is very similar to Xsyrk (see header for details), except for the fact that
// the main XgemmUpper/XgemmLower kernel is called twice: C = A*B^T + C and C = B*A^T + C.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYR2K_H_
#define CLBLAST_ROUTINES_XSYR2K_H_

#include "routine.hpp"
#include "routines/level3/xsyrk.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xsyr2k: public Xsyrk<T> {
 public:

  // Uses methods and variables from the regular Xsyrk routine
  using Xsyrk<T>::event_;
  using Xsyrk<T>::SyrkAB;

  // Constructor
  Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");

  // Templated-precision implementation of the routine
  void DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
               const size_t n, const size_t k,
               const T alpha,
               const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
               const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
               const T beta,
               const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSYR2K_H_
#endif
CLBlast-1.6.3/src/routines/level3/xsyrk.cpp
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the Xsyrk class (see the header for information about the class).
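//
// [Editor's note] Reference semantics of SYRK, as a minimal host-side sketch for the
// single-precision lower-triangular no-transpose case (column-major); the helper name is an
// assumption of this sketch and is not part of CLBlast. The device code below instead computes
// the full padded product with the tuned Xgemm kernel and writes back only one triangle.
#include <cstddef>
static void SyrkLowerReference(const size_t n, const size_t k, const float alpha,
                               const float* a, const size_t a_ld,  // A is n-by-k
                               const float beta, float* c, const size_t c_ld) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t i = j; i < n; ++i) {  // C := alpha * A * A^T + beta * C, lower triangle only
      auto sum = 0.0f;
      for (size_t l = 0; l < k; ++l) { sum += a[l * a_ld + i] * a[l * a_ld + j]; }
      c[j * c_ld + i] = alpha * sum + beta * c[j * c_ld + i];
    }
  }
}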
// // ================================================================================================= #include "routines/level3/xsyrk.hpp" #include "routines/level3/xgemm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xsyrk::Xsyrk(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" #include "../../kernels/level3/transpose_fast.opencl" #include "../../kernels/level3/transpose_pad.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { } // ================================================================================================= // The main routine template void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { const auto b_transpose = (a_transpose != Transpose::kNo) ? Transpose::kNo : Transpose::kYes; const auto b_buffer = a_buffer; const auto b_offset = a_offset; const auto b_ld = a_ld; SyrkAB(layout, triangle, a_transpose, b_transpose, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, event_); } template void Xsyrk::SyrkAB(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Transpose b_transpose, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, EventPointer final_event) { // Computes the transpose/conjugate options and sets the a/b/c sizes based on that bool a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate; size_t a_one, a_two, b_one, b_two, c_one, c_two; Xgemm::ProcessArguments(layout, a_transpose, b_transpose, n, n, k, a_one, a_two, b_one, b_two, c_one, c_two, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, db_["GEMMK"]); // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage // space. 
Also tests that the leading dimensions of: // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"] * db_["KREG"]); // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account // whether the matrices need to be rotated or not for the kernel. const auto a_one_i = (Xgemm::a_want_rotated_(db_["GEMMK"])) ? k_ceiled : n_ceiled; const auto a_two_i = (Xgemm::a_want_rotated_(db_["GEMMK"])) ? n_ceiled : k_ceiled; const auto b_one_i = (!Xgemm::b_want_rotated_(db_["GEMMK"])) ? k_ceiled : n_ceiled; const auto b_two_i = (!Xgemm::b_want_rotated_(db_["GEMMK"])) ? n_ceiled : k_ceiled; // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; // Determines whether or not temporary matrices are needed const auto a_no_temp = Xgemm::NoTempBuffer(a_one, a_one_i, a_two, a_two_i, a_ld, a_offset, a_do_transpose, a_conjugate); const auto b_no_temp = Xgemm::NoTempBuffer(b_one, b_one_i, b_two, b_two_i, b_ld, b_offset, b_do_transpose, b_conjugate); // Creates the temporary matrices auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, a_one_i * a_two_i); auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, b_one_i * b_two_i); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_temp, ConstantOne(), program_, true, a_do_transpose, false); eventWaitList.push_back(eventProcessA); } if (!b_no_temp) { auto eventProcessB = Event(); PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, b_one_i, b_two_i, b_one_i, 0, b_temp, ConstantOne(), program_, true, b_do_transpose, false); eventWaitList.push_back(eventProcessB); } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. 
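// [Editor's note] A worked example of the ceiling logic above, with illustrative tuned values
// MWG = 64 and NWG = 32 (the actual values come from the per-device database db_): for n = 100,
// Ceil(Ceil(100, 64), 32) first rounds 100 up to 128, which is already a multiple of 32, so
// n_ceiled = 128 and the temporary c_temp buffer holds 128 * 128 elements even though C is only
// 100 * 100.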
auto eventProcessC = Event();
  PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
                         n, n, c_ld, c_offset, c_buffer,
                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                         ConstantOne<T>(), program_,
                         true, c_do_transpose, false);
  eventWaitList.push_back(eventProcessC);

  // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments
  kernel.SetArgument(0, static_cast<int>(n_ceiled));
  kernel.SetArgument(1, static_cast<int>(k_ceiled));
  kernel.SetArgument(2, GetRealArg(alpha));
  kernel.SetArgument(3, GetRealArg(beta));
  kernel.SetArgument(4, a_temp());
  kernel.SetArgument(5, b_temp());
  kernel.SetArgument(6, c_temp());

  // Computes the global and local thread sizes
  auto global = std::vector<size_t>{
    (n_ceiled * db_["MDIMC"]) / db_["MWG"],
    (n_ceiled * db_["NDIMC"]) / db_["NWG"]
  };
  auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

  // Launches the kernel
  auto eventKernel = Event();
  RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
  eventWaitList.push_back(eventKernel);

  // Runs the post-processing kernel
  const auto upper = Xgemm<T>::c_want_rotated_(db_["GEMMK"]) ? (triangle == Triangle::kLower) :
                                                               (triangle == Triangle::kUpper);
  const auto lower = !upper;
  PadCopyTransposeMatrix(queue_, device_, db_, final_event, eventWaitList,
                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                         n, n, c_ld, c_offset, c_buffer,
                         ConstantOne<T>(), program_,
                         false, c_do_transpose, false, upper, lower, false);
}

// =================================================================================================

// Compiles the templated class
template class Xsyrk<half>;
template class Xsyrk<float>;
template class Xsyrk<double>;
template class Xsyrk<float2>;
template class Xsyrk<double2>;

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/routines/level3/xsyrk.hpp
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the Xsyrk routine. The precision is implemented using a template argument.
// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
//   1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
//   2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
//      performance reasons, as the actual masking is done later (see the first point).
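//
// [Editor's note] A worked example of the global/local size computation in SyrkAB, with
// illustrative tuned values MWG = NWG = 64 and MDIMC = NDIMC = 16: for n_ceiled = 256 the global
// size becomes {256 * 16 / 64, 256 * 16 / 64} = {64, 64} work-items arranged in 16x16
// work-groups, i.e. a 4x4 grid of work-groups in which each work-group computes one 64x64 tile
// of the padded C matrix.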
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XSYRK_H_ #define CLBLAST_ROUTINES_XSYRK_H_ #include "routine.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xsyrk: public Routine { public: // Constructor Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK"); // Templated-precision implementation of the routine void DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); // Helper function to be reused for SYR2K void SyrkAB(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Transpose b_transpose, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, EventPointer final_event); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XSYRK_H_ #endif CLBlast-1.6.3/src/routines/level3/xtrmm.cpp000066400000000000000000000143331463263031500205200ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xtrmm class (see the header for information about the class). // // ================================================================================================= #include "routines/level3/xtrmm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xtrmm::Xtrmm(Queue &queue, EventPointer event, const std::string &name): Xgemm(queue, event, name) { } // ================================================================================================= // The main routine template void Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. This is based on whether or not matrix is A (on the left) // or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the triangular A matrix TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Checks for validity of the input/output B matrix const auto b_one = (layout == Layout::kRowMajor) ? n : m; const auto b_two = (layout == Layout::kRowMajor) ? 
m : n; TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); // Creates a copy of B to avoid overwriting input in GEMM while computing output const auto b_size = (b_ld * (b_two - 1) + b_one + b_offset); auto b_buffer_copy = Buffer(context_, b_size); b_buffer.CopyTo(queue_, b_size, b_buffer_copy); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || (triangle == Triangle::kLower && layout == Layout::kRowMajor)); auto kernel_name = (is_upper) ? "TriaUpperToSquared" : "TriaLowerToSquared"; // Determines whether or not the triangular matrix is unit-diagonal auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; // Temporary buffer for a copy of the triangular matrix auto temp_triangular = Buffer(context_, k*k); // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the triangular-to-squared kernel kernel.SetArgument(0, static_cast(k)); kernel.SetArgument(1, static_cast(a_ld)); kernel.SetArgument(2, static_cast(a_offset)); kernel.SetArgument(3, a_buffer()); kernel.SetArgument(4, static_cast(k)); kernel.SetArgument(5, static_cast(k)); kernel.SetArgument(6, static_cast(0)); kernel.SetArgument(7, temp_triangular()); kernel.SetArgument(8, static_cast(unit_diagonal)); // Uses the common padding kernel's thread configuration. This is allowed, since the // triangular-to-squared kernel uses the same parameters. auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); // Synchronize now: 'DoGemm' does not accept a list of events to wait for kernelEvent.WaitForCompletion(); // Runs the regular Xgemm code with either "B := alpha*A*B" or ... if (side == Side::kLeft) { DoGemm(layout, a_transpose, Transpose::kNo, m, n, k, alpha, temp_triangular, 0, k, b_buffer_copy, b_offset, b_ld, ConstantZero(), b_buffer, b_offset, b_ld); } // ... with "B := alpha*B*A". Note that A and B are now reversed. 
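// [Editor's note] In both branches the GEMM beta is ConstantZero and the B input is read from
// b_buffer_copy (made above), so writing the result directly into b_buffer cannot overwrite
// input that still has to be read.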
else { try { DoGemm(layout, Transpose::kNo, a_transpose, m, n, k, alpha, b_buffer_copy, b_offset, b_ld, temp_triangular, 0, k, ConstantZero(), b_buffer, b_offset, b_ld); } catch (BLASError &e) { // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine switch(e.status()) { case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details()); case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); default: throw; } } } } // ================================================================================================= // Compiles the templated class template class Xtrmm; template class Xtrmm; template class Xtrmm; template class Xtrmm; template class Xtrmm; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level3/xtrmm.hpp000066400000000000000000000040251463263031500205220ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xtrmm routine. The implementation is based on first transforming the // upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM // routine. Therefore, this class inherits from the Xgemm class. 
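//
// [Editor's note] The buffer-size rule used when DoTrmm copies B, with a worked example
// (illustrative values): in column-major storage with b_one rows, b_two columns, and leading
// dimension b_ld, the last element touched sits at index b_offset + b_ld * (b_two - 1) +
// (b_one - 1), so b_size = b_ld * (b_two - 1) + b_one + b_offset elements suffice. For m = 100,
// n = 50 in column-major order (b_one = 100, b_two = 50) with b_ld = 128 and b_offset = 0, this
// gives b_size = 128 * 49 + 100 = 6372 elements.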
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XTRMM_H_ #define CLBLAST_ROUTINES_XTRMM_H_ #include "routines/level3/xgemm.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Xtrmm: public Xgemm { public: // Uses methods and variables the regular Xgemm routine using Xgemm::routine_name_; using Xgemm::queue_; using Xgemm::context_; using Xgemm::device_; using Xgemm::program_; using Xgemm::db_; using Xgemm::DoGemm; // Constructor Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM"); // Templated-precision implementation of the routine void DoTrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XTRMM_H_ #endif CLBlast-1.6.3/src/routines/level3/xtrsm.cpp000066400000000000000000000276351463263031500205370ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based // on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular // Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, // and Jack Dongarra and the OpenCL implementation in clBLAS. // // ================================================================================================= #include "routines/level3/xtrsm.hpp" #include "routines/levelx/xinvert.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xtrsm::Xtrsm(Queue &queue, EventPointer event, const std::string &name): Xgemm(queue, event, name) { } // ================================================================================================= // The entry point: transforming into col-major (if needed) and then running the col-major version template void Xtrsm::DoTrsm(const Layout layout, Side side, Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, size_t m, size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { // Converts row-major to a col-major problem: // The idea is that // B = A*X // can be computed as // B' = (A*X)' = X'*A' // Since changing the order is basically a transpose on each matrix, the formula becomes: // B = X*A // So only the side (left/right) and the triangle (upper/lower) are changed and M/N are swapped if (layout == Layout::kRowMajor) { std::swap(m, n); side = (side == Side::kLeft) ? 
Side::kRight : Side::kLeft; triangle = (triangle == Triangle::kLower) ? Triangle::kUpper : Triangle::kLower; } // Runs the col-major version of TRSM TrsmColMajor(side, triangle, a_transpose, diagonal, m, n, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld); } // ================================================================================================= // The main routine template void Xtrsm::TrsmColMajor(const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { // Settings constexpr auto block_size = size_t{16}; // tuneable // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Some parts of this kernel are not tunable and thus require some minimal OpenCL properties if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16 throw RuntimeErrorCode(StatusCode::kNotImplemented); } // Computes the k dimension. This is based on whether or not matrix is A (on the left) // or B (on the right) in the Xgemm routine. const auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the triangular A matrix TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Checks for validity of the input B matrix TestMatrixB(m, n, b_buffer, b_offset, b_ld); // Creates a copy of B to avoid overwriting input in GEMM while computing output const auto b_size = b_ld * (n - 1) + m + b_offset; const auto x_one = m; const auto x_two = n; const auto x_size = b_size; const auto x_ld = b_ld; const auto x_offset = b_offset; auto x_buffer = Buffer(context_, x_size); b_buffer.CopyTo(queue_, x_size, x_buffer); // Temporary buffer for the inverse of the A matrix const auto a_inv_size = Ceil(k, block_size) * block_size; auto a_inv_buffer = Buffer(context_, a_inv_size); // Fills the output buffer with zeros auto eventWaitList = std::vector(); auto fill_matrix_event = Event(); FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), eventWaitList, x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero(), 16); fill_matrix_event.WaitForCompletion(); // Inverts the diagonal blocks auto diagonal_invert_event = Event(); auto inverter = Xinvert(queue_, diagonal_invert_event.pointer()); inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, triangle, diagonal, k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer); diagonal_invert_event.WaitForCompletion(); // Derives properties based on the arguments const auto condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) || (triangle == Triangle::kLower && a_transpose == Transpose::kNo)); // Left side if (side == Side::kLeft) { // True when (lower triangular) or (upper triangular and transposed) if (condition) { for (auto i = size_t{0}; i < m; i += block_size) { const auto gemm_alpha = (i == 0) ? alpha : ConstantOne(); const auto current_block_size = std::min(m - i, block_size); auto gemm1_event = Event(); auto gemm1 = Xgemm(queue_, gemm1_event.pointer()); gemm1.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, current_block_size, n, current_block_size, gemm_alpha, a_inv_buffer, i * block_size, block_size, b_buffer, b_offset + i, b_ld, ConstantZero(), x_buffer, x_offset + i, x_ld); gemm1_event.WaitForCompletion(); if (i + block_size >= m) { break; } const auto this_a_offset = (a_transpose == Transpose::kNo) ? 
(i + block_size) + i * a_ld : i + (block_size + i) * a_ld; auto gemm2_event = Event(); auto gemm2 = Xgemm(queue_, gemm2_event.pointer()); gemm2.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, m - i - block_size, n, block_size, ConstantNegOne(), a_buffer, this_a_offset + a_offset, a_ld, x_buffer, x_offset + i, x_ld, gemm_alpha, b_buffer, b_offset + i + block_size, b_ld); gemm2_event.WaitForCompletion(); } } // True when (upper triangular) or (lower triangular and transposed) else { const auto special_block_size = (m % block_size == 0) ? block_size : (m % block_size); const auto i_start = static_cast(m) - static_cast(special_block_size); for (auto i = i_start; i >= 0; i -= static_cast(block_size)) { const auto current_block_size = (i == i_start) ? special_block_size : block_size; const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne(); auto gemm1_event = Event(); auto gemm1 = Xgemm(queue_, gemm1_event.pointer()); gemm1.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, current_block_size, n, current_block_size, gemm_alpha, a_inv_buffer, i * block_size, block_size, b_buffer, b_offset + i, b_ld, ConstantZero(), x_buffer, x_offset + i, x_ld); gemm1_event.WaitForCompletion(); if (i - static_cast(block_size) < 0) { break; } const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i; auto gemm2_event = Event(); auto gemm2 = Xgemm(queue_, gemm2_event.pointer()); gemm2.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, i, n, current_block_size, ConstantNegOne(), a_buffer, this_a_offset + a_offset, a_ld, x_buffer, x_offset + i, x_ld, gemm_alpha, b_buffer, b_offset, b_ld); gemm2_event.WaitForCompletion(); } } } // Right side else { // True when (lower triangular) or (upper triangular and transposed) if (condition) { const auto special_block_size = (n % block_size == 0) ? block_size : (n % block_size); const auto i_start = static_cast(n) - static_cast(special_block_size); for (auto i = i_start; i >= 0; i -= static_cast(block_size)) { const auto current_block_size = (i == i_start) ? special_block_size : block_size; const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne(); auto gemm1_event = Event(); auto gemm1 = Xgemm(queue_, gemm1_event.pointer()); gemm1.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, current_block_size, current_block_size, gemm_alpha, b_buffer, b_offset + i * b_ld, b_ld, a_inv_buffer, i * block_size, block_size, ConstantZero(), x_buffer, x_offset + i * x_ld, x_ld); gemm1_event.WaitForCompletion(); if (i - static_cast(block_size) < 0) { break; } const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld; auto gemm2_event = Event(); auto gemm2 = Xgemm(queue_, gemm2_event.pointer()); gemm2.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, i, current_block_size, ConstantNegOne(), x_buffer, x_offset + i * x_ld, x_ld, a_buffer, this_a_offset + a_offset, a_ld, gemm_alpha, b_buffer, b_offset, b_ld); gemm2_event.WaitForCompletion(); } } // True when (upper triangular) or (lower triangular and transposed) else { for (auto i = size_t{0}; i < n; i += block_size) { const auto gemm_alpha = (i == 0) ? 
alpha : ConstantOne(); const auto current_block_size = std::min(n - i, block_size); auto gemm1_event = Event(); auto gemm1 = Xgemm(queue_, gemm1_event.pointer()); gemm1.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, current_block_size, current_block_size, gemm_alpha, b_buffer, b_offset + i * b_ld, b_ld, a_inv_buffer, i * block_size, block_size, ConstantZero(), x_buffer, x_offset + i * x_ld, x_ld); gemm1_event.WaitForCompletion(); if (i + block_size >= n) { break; } const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld; auto gemm2_event = Event(); auto gemm2 = Xgemm(queue_, gemm2_event.pointer()); gemm2.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, n - i - block_size, block_size, ConstantNegOne(), x_buffer, x_offset + i * x_ld, x_ld, a_buffer, this_a_offset + a_offset, a_ld, gemm_alpha, b_buffer, b_offset + (i + block_size) * b_ld, b_ld); gemm2_event.WaitForCompletion(); } } } // Retrieves the results x_buffer.CopyToAsync(queue_, b_size, b_buffer, event_); } // ================================================================================================= // Compiles the templated class template class Xtrsm; template class Xtrsm; template class Xtrsm; template class Xtrsm; template class Xtrsm; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/level3/xtrsm.hpp000066400000000000000000000044761463263031500205420ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xtrsm routine. The implementation is based on ??? (TODO). // Therefore, this class inherits from the Xgemm class. 
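//
// For reference, the solve computed here is op(A) * X = alpha * B (left side) or
// X * op(A) = alpha * B (right side), with X overwriting B. A minimal scalar sketch of the same
// forward substitution for the left-side, lower-triangular, non-transposed case (illustrative
// only; `a`, `b` and `x` are hypothetical host arrays, not the device buffers used below):
//
//   for (size_t j = 0; j < n; ++j) {
//     for (size_t i = 0; i < m; ++i) {
//       auto sum = alpha * b[i + j * b_ld];
//       for (size_t l = 0; l < i; ++l) { sum -= a[i + l * a_ld] * x[l + j * b_ld]; }
//       x[i + j * b_ld] = sum / a[i + i * a_ld];  // non-unit diagonal
//     }
//   }
//
// The routine below reaches the same result with inverted diagonal blocks plus GEMM updates.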
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XTRSM_H_
#define CLBLAST_ROUTINES_XTRSM_H_

#include "routines/level3/xgemm.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xtrsm: public Xgemm<T> {
 public:

  // Uses methods and variables from the Xgemm routine
  using Xgemm<T>::queue_;
  using Xgemm<T>::context_;
  using Xgemm<T>::device_;
  using Xgemm<T>::db_;
  using Xgemm<T>::program_;
  using Xgemm<T>::event_;
  using Xgemm<T>::DoGemm;

  // Constructor
  Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM");

  // Templated-precision implementation of the routine
  void DoTrsm(const Layout layout, Side side, Triangle triangle,
              const Transpose a_transpose, const Diagonal diagonal,
              size_t m, size_t n, const T alpha,
              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
              const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);

  // Implementation of the column-major version
  void TrsmColMajor(const Side side, const Triangle triangle,
                    const Transpose a_transpose, const Diagonal diagonal,
                    const size_t m, const size_t n, const T alpha,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
};

// =================================================================================================
} // namespace clblast

#endif // CLBLAST_ROUTINES_XTRSM_H_

CLBlast-1.6.3/src/routines/levelx/
CLBlast-1.6.3/src/routines/levelx/xaxpybatched.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the XaxpyBatched class (see the header for information about the class).
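//
// For each batch index the routine performs a regular AXPY on the sub-vectors selected by the
// per-batch offsets, i.e. (host-side reference sketch, illustrative only, with hypothetical raw
// arrays instead of the OpenCL buffers below):
//
//   for (size_t batch = 0; batch < batch_count; ++batch) {
//     for (size_t i = 0; i < n; ++i) {
//       y[y_offsets[batch] + i * y_inc] += alphas[batch] * x[x_offsets[batch] + i * x_inc];
//     }
//   }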
// // ================================================================================================= #include "routines/levelx/xaxpybatched.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template XaxpyBatched::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xaxpy"}, PrecisionValue(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xaxpy.opencl" }) { } // ================================================================================================= // The main routine template void XaxpyBatched::DoAxpyBatched(const size_t n, const std::vector &alphas, const Buffer &x_buffer, const std::vector &x_offsets, const size_t x_inc, const Buffer &y_buffer, const std::vector &y_offsets, const size_t y_inc, const size_t batch_count) { // Tests for a valid batch count if ((batch_count < 1) || (alphas.size() != batch_count) || (x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) { throw BLASError(StatusCode::kInvalidBatchCount); } // Makes sure all dimensions are larger than zero if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity for (auto batch = size_t{0}; batch < batch_count; ++batch) { TestVectorX(n, x_buffer, x_offsets[batch], x_inc); TestVectorY(n, y_buffer, y_offsets[batch], y_inc); } // Upload the arguments to the device auto x_offsets_int = std::vector(batch_count); auto y_offsets_int = std::vector(batch_count); for (auto batch = size_t{ 0 }; batch < batch_count; ++batch) { x_offsets_int[batch] = static_cast(x_offsets[batch]); y_offsets_int[batch] = static_cast(y_offsets[batch]); } auto x_offsets_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); auto y_offsets_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); auto alphas_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); x_offsets_device.Write(queue_, batch_count, x_offsets_int); y_offsets_device.Write(queue_, batch_count, y_offsets_int); alphas_device.Write(queue_, batch_count, alphas); // Retrieves the Xaxpy kernel from the compiled binary auto kernel = Kernel(program_, "XaxpyBatched"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); kernel.SetArgument(1, alphas_device()); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, x_offsets_device()); kernel.SetArgument(4, static_cast(x_inc)); kernel.SetArgument(5, y_buffer()); kernel.SetArgument(6, y_offsets_device()); kernel.SetArgument(7, static_cast(y_inc)); // Launches the kernel auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"], batch_count}; auto local = std::vector{db_["WGS"], 1}; RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= // Compiles the templated class template class XaxpyBatched; template class XaxpyBatched; template class XaxpyBatched; template class XaxpyBatched; template class XaxpyBatched; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/levelx/xaxpybatched.hpp000066400000000000000000000031521463263031500221440ustar00rootroot00000000000000 // ================================================================================================= // This 
file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the XaxpyBatched routine. This is a non-blas batched version of AXPY. // // ================================================================================================= #ifndef CLBLAST_ROUTINES_XAXPYBATCHED_H_ #define CLBLAST_ROUTINES_XAXPYBATCHED_H_ #include #include "routine.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class XaxpyBatched: public Routine { public: // Constructor XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED"); // Templated-precision implementation of the routine void DoAxpyBatched(const size_t n, const std::vector &alphas, const Buffer &x_buffer, const std::vector &x_offsets, const size_t x_inc, const Buffer &y_buffer, const std::vector &y_offsets, const size_t y_inc, const size_t batch_count); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XAXPYBATCHED_H_ #endif CLBlast-1.6.3/src/routines/levelx/xcol2im.cpp000066400000000000000000000114711463263031500210330ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xcol2im class (see the header for information about the class). // // ================================================================================================= #include "routines/levelx/xcol2im.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template Xcol2im::Xcol2im(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy"}, PrecisionValue(), {}, { #include "../../kernels/levelx/col2im.opencl" }) { } // ================================================================================================= // The main routine template void Xcol2im::DoCol2im(const KernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const Buffer &col_buffer, const size_t col_offset, const Buffer &im_buffer, const size_t im_offset) { // Flip the output along kernel_h and kernel_w, or not. const auto kernel_name = (kernel_mode == KernelMode::kConvolution) ? "Xcol2imKernelFlip" : "Xcol2imKernelNormal"; // Makes sure all dimensions are larger than zero if ((channels == 0) || (height == 0) || (width == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Sets the output height and width const auto size_h = height + 2 * pad_h; const auto padding_h = dilation_h * (kernel_h - 1) + 1; const auto col_h = (size_h >= padding_h) ? 
(size_h - padding_h) / stride_h + 1 : 1; const auto size_w = width + 2 * pad_w; const auto padding_w = dilation_w * (kernel_w - 1) + 1; const auto col_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1; int stride_bez_h = 0; int stride_bez_w = 0; int dilation_bez_h = 0; int dilation_bez_w = 0; int gcd_h = 0; int gcd_w = 0; EuclidGCD(static_cast(stride_h), static_cast(dilation_h), stride_bez_h, dilation_bez_h, gcd_h); EuclidGCD(static_cast(stride_w), static_cast(dilation_w), stride_bez_w, dilation_bez_w, gcd_w); // Retrieves the kernel from the compiled binary auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(height)); kernel.SetArgument(1, static_cast(width)); kernel.SetArgument(2, static_cast(channels)); kernel.SetArgument(3, static_cast(col_h)); kernel.SetArgument(4, static_cast(col_w)); kernel.SetArgument(5, static_cast(kernel_h)); kernel.SetArgument(6, static_cast(kernel_w)); kernel.SetArgument(7, static_cast(pad_h)); kernel.SetArgument(8, static_cast(pad_w)); kernel.SetArgument(9, static_cast(stride_h)); kernel.SetArgument(10, static_cast(stride_w)); kernel.SetArgument(11, static_cast(dilation_h)); kernel.SetArgument(12, static_cast(dilation_w)); kernel.SetArgument(13, stride_bez_h); kernel.SetArgument(14, stride_bez_w); kernel.SetArgument(15, dilation_bez_h); kernel.SetArgument(16, dilation_bez_w); kernel.SetArgument(17, gcd_h); kernel.SetArgument(18, gcd_w); kernel.SetArgument(19, col_buffer()); kernel.SetArgument(20, static_cast(col_offset)); kernel.SetArgument(21, im_buffer()); kernel.SetArgument(22, static_cast(im_offset)); // Launches the kernel const auto w_ceiled = Ceil((width - 1) / gcd_w + 1, db_["COPY_DIMX"]); const auto h_ceiled = Ceil((height - 1) / gcd_h + 1, db_["COPY_DIMY"]); const auto global = std::vector{w_ceiled, h_ceiled * channels}; const auto local = std::vector{db_["COPY_DIMX"], db_["COPY_DIMY"]}; RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= // Compiles the templated class template class Xcol2im; template class Xcol2im; template class Xcol2im; template class Xcol2im; template class Xcol2im; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/levelx/xcol2im.hpp000066400000000000000000000034641463263031500210430ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xcol2im routine. The precision is implemented using a template argument. // Uses the tuning parameters from the regular copy kernel. 
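//
// The column-space dimensions computed in DoCol2im follow the usual convolution geometry,
// clamped to a minimum of one. A worked example of the col_h formula (values chosen purely for
// illustration):
//
//   col_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1
//   e.g. height = 32, pad_h = 1, kernel_h = 3, dilation_h = 1, stride_h = 2:
//        (32 + 2 - 3) / 2 + 1 = 16   (integer division; col_w is analogous)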
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XCOL2IM_H_
#define CLBLAST_ROUTINES_XCOL2IM_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xcol2im: public Routine {
 public:

  // Constructor
  Xcol2im(Queue &queue, EventPointer event, const std::string &name = "COL2IM");

  // Templated-precision implementation of the routine
  void DoCol2im(const KernelMode kernel_mode,
                const size_t channels, const size_t height, const size_t width,
                const size_t kernel_h, const size_t kernel_w,
                const size_t pad_h, const size_t pad_w,
                const size_t stride_h, const size_t stride_w,
                const size_t dilation_h, const size_t dilation_w,
                const Buffer<T> &col_buffer, const size_t col_offset,
                const Buffer<T> &im_buffer, const size_t im_offset);
};

// =================================================================================================
} // namespace clblast

#endif // CLBLAST_ROUTINES_XCOL2IM_H_

CLBlast-1.6.3/src/routines/levelx/xconvgemm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the Xconvgemm class (see the header for information about the class).
//
// =================================================================================================

#include <string>
#include <vector>

#include "routines/levelx/xconvgemm.hpp"
#include "routines/levelx/xim2col.hpp"

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xconvgemm<T>::Xconvgemm(Queue &queue, EventPointer event, const std::string &name,
                        const ConvGemmMethod method):
    Routine(queue, event, name, {"Xconvgemm"},
            PrecisionValue<T>(), {}, {
    (method == ConvGemmMethod::kWithIm2Col) ?
"#define CONVGEMM_WITH_IM2COL\n" : "", #include "../../kernels/level3/level3.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include "../../kernels/level3/xgemm_direct_part3.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/levelx/xconvgemm_part1.opencl" #include "../../kernels/levelx/xconvgemm_part2.opencl" }), method_(method) { } // ================================================================================================= template void Xconvgemm::DoConvgemm(const KernelMode kernel_mode, const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, const Buffer &im_buffer, const size_t im_offset, const Buffer &kernel_buffer, const size_t kernel_offset, const Buffer &result_buffer, const size_t result_offset) { // Tests for a valid batch count if (batch_count == 0) { throw BLASError(StatusCode::kInvalidBatchCount); } // Makes sure all dimensions are larger than zero if ((channels == 0) || (height == 0) || (width == 0) || (num_kernels == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Sets the output height and width const auto size_h = height + 2 * pad_h; const auto padding_h = dilation_h * (kernel_h - 1) + 1; const auto output_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1; const auto size_w = width + 2 * pad_w; const auto padding_w = dilation_w * (kernel_w - 1) + 1; const auto output_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1; // Sets other useful variables const auto patch_size = kernel_h * kernel_w * channels; const auto num_patches = output_h * output_w; // Possible approach: im2col + GEMM // result = GEMM(im2col(image), kernel) auto col_buffer = Buffer(context_, 0); // nullptr, will be optionally created later if (method_ == ConvGemmMethod::kWithIm2Col) { // Temporary col matrix const auto col_size = (method_ == ConvGemmMethod::kWithIm2Col) ? 
patch_size * num_patches * batch_count : 1; col_buffer = Buffer(context_, col_size); // Loops over each batch for (auto batch_id = size_t{0}; batch_id < batch_count; ++batch_id) { // im2col const auto im_batch_offset = batch_id * channels * height * width + im_offset; const auto col_batch_offset = batch_id * patch_size * num_patches; auto im2col_event = Event(); auto im2col = Xim2col(queue_, im2col_event.pointer()); im2col.DoIm2col(kernel_mode, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, im_buffer, im_batch_offset, col_buffer, col_batch_offset); im2col_event.WaitForCompletion(); } } // Strided batched GEMM: C (result) = alpha (1) * A (col) * B (kernel) + beta (0) * C (result) const auto col_stride = patch_size * num_patches; const auto result_stride = num_kernels * output_h * output_w; // Tests the matrices for validity TestMatrixB(patch_size, num_kernels, kernel_buffer, kernel_offset, patch_size); for (auto batch = size_t{0}; batch < batch_count; ++batch) { if (method_ == ConvGemmMethod::kWithIm2Col) { TestMatrixA(num_patches, patch_size, col_buffer, col_stride * batch, num_patches); } else { // TODO: check for valid image tensor } TestMatrixC(num_patches, num_kernels, result_buffer, result_offset + result_stride * batch, num_patches); } // Retrieves the proper XgemmDirect kernel from the compiled binary const std::string kernel_name = (method_ == ConvGemmMethod::kWithIm2Col) ? "Xconvgemm" : (kernel_mode == KernelMode::kConvolution) ? "XconvgemmFlip" : "XconvgemmNormal"; auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(num_patches)); kernel.SetArgument(1, static_cast(num_kernels)); kernel.SetArgument(2, static_cast(patch_size)); kernel.SetArgument(3, kernel_buffer()); kernel.SetArgument(4, static_cast(kernel_offset)); kernel.SetArgument(5, result_buffer()); kernel.SetArgument(6, static_cast(result_offset)); kernel.SetArgument(7, static_cast(result_stride)); if (method_ == ConvGemmMethod::kWithIm2Col) { kernel.SetArgument(8, col_buffer()); kernel.SetArgument(9, static_cast(0)); kernel.SetArgument(10, static_cast(col_stride)); } if (method_ == ConvGemmMethod::kSingleKernel) { kernel.SetArgument(8, im_buffer()); kernel.SetArgument(9, static_cast(im_offset)); kernel.SetArgument(10, static_cast(height)); kernel.SetArgument(11, static_cast(width)); kernel.SetArgument(12, static_cast(channels)); kernel.SetArgument(13, static_cast(kernel_h)); kernel.SetArgument(14, static_cast(kernel_w)); kernel.SetArgument(15, static_cast(pad_h)); kernel.SetArgument(16, static_cast(pad_w)); kernel.SetArgument(17, static_cast(stride_h)); kernel.SetArgument(18, static_cast(stride_w)); kernel.SetArgument(19, static_cast(dilation_h)); kernel.SetArgument(20, static_cast(dilation_w)); kernel.SetArgument(21, static_cast(output_h)); kernel.SetArgument(22, static_cast(output_w)); } // Computes the global and local thread sizes const auto m_ceiled = Ceil(num_patches, db_["WGD"]); const auto n_ceiled = Ceil(num_kernels, db_["WGD"]); const auto global = std::vector{ (m_ceiled * db_["MDIMCD"]) / db_["WGD"], (n_ceiled * db_["NDIMCD"]) / db_["WGD"], batch_count }; const auto local = std::vector{db_["MDIMCD"], db_["NDIMCD"], 1}; // Launches the kernel RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= // Compiles the templated class template class Xconvgemm; template class Xconvgemm; template class 
Xconvgemm<float2>; template class Xconvgemm<double2>;

// =================================================================================================
} // namespace clblast

CLBlast-1.6.3/src/routines/levelx/xconvgemm.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the Xconvgemm routine. The precision is implemented as a template argument.
// This implements batched convolution of a 4D input 'image' tensor, a 3D input 'kernel' matrix,
// resulting in a 4D output 'result' tensor.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XCONVGEMM_H_
#define CLBLAST_ROUTINES_XCONVGEMM_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xconvgemm: public Routine {
 public:

  // Constructor
  enum class ConvGemmMethod {kWithIm2Col, kSingleKernel};
  Xconvgemm(Queue &queue, EventPointer event, const std::string &name = "CONVGEMM",
            const ConvGemmMethod method = ConvGemmMethod::kSingleKernel);

  // Templated-precision implementation of the routine
  void DoConvgemm(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w,
                  const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w,
                  const size_t dilation_h, const size_t dilation_w,
                  const size_t num_kernels, const size_t batch_count,
                  const Buffer<T> &im_buffer, const size_t im_offset,
                  const Buffer<T> &kernel_buffer, const size_t kernel_offset,
                  const Buffer<T> &result_buffer, const size_t result_offset);

 private:
  const ConvGemmMethod method_;
};

// =================================================================================================
} // namespace clblast

#endif // CLBLAST_ROUTINES_XCONVGEMM_H_

CLBlast-1.6.3/src/routines/levelx/xgemmbatched.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the XgemmBatched class (see the header for information about the class).
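//
// Each batch entry is an independent GEMM, C_b = alphas[b] * op(A_b) * op(B_b) + betas[b] * C_b,
// with all A_b (and likewise B_b, C_b) sharing one buffer and addressed via per-batch offsets.
// A minimal host-side usage sketch (illustrative only; the queue, event, sizes, offset vectors
// and buffers are assumed to be set up elsewhere):
//
//   auto routine = XgemmBatched<float>(queue, event.pointer());
//   routine.DoGemmBatched(Layout::kColMajor, Transpose::kNo, Transpose::kNo, m, n, k,
//                         alphas, a_buffer, a_offsets, a_ld,
//                         b_buffer, b_offsets, b_ld,
//                         betas, c_buffer, c_offsets, c_ld,
//                         batch_count);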
// // ================================================================================================= #include "routines/levelx/xgemmbatched.hpp" #include "routines/level3/xgemm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template XgemmBatched::XgemmBatched(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"}, PrecisionValue(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" #include "../../kernels/level3/transpose_fast.opencl" #include "../../kernels/level3/transpose_pad.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include "../../kernels/level3/xgemm_direct_part3.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_batched.opencl" #include "../../kernels/level3/xgemm_direct_batched.opencl" }) { } // ================================================================================================= // The main routine template void XgemmBatched::DoGemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const std::vector &alphas, const Buffer & a_buffer, const std::vector &a_offsets, const size_t a_ld, const Buffer & b_buffer, const std::vector &b_offsets, const size_t b_ld, const std::vector &betas, const Buffer & c_buffer, const std::vector &c_offsets, const size_t c_ld, const size_t batch_count) { // Tests for a valid batch count if ((batch_count < 1) || (alphas.size() != batch_count) || (betas.size() != batch_count) || (a_offsets.size() != batch_count) || (b_offsets.size() != batch_count) || (c_offsets.size() != batch_count)) { throw BLASError(StatusCode::kInvalidBatchCount); } // Two methods to choose from, select which one to run const auto do_gemm_direct = Xgemm::UseDirectKernel(m, n, k, db_["XGEMM_MIN_INDIRECT_SIZE"]); const auto gemm_kernel_id = (do_gemm_direct) ? 
0 : db_["GEMMK"]; // Computes the transpose/conjugate options and sets the a/b/c sizes based on that bool a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate; size_t a_one, a_two, b_one, b_two, c_one, c_two; Xgemm::ProcessArguments(layout, a_transpose, b_transpose, m, n, k, a_one, a_two, b_one, b_two, c_one, c_two, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, gemm_kernel_id); // Tests the matrices for validity TestBatchedMatrixA(a_one, a_two, a_buffer, a_offsets, a_ld, false); // don't test for invalid LD TestBatchedMatrixB(b_one, b_two, b_buffer, b_offsets, b_ld, false); // don't test for invalid LD TestBatchedMatrixC(c_one, c_two, c_buffer, c_offsets, c_ld); // Upload the scalar arguments to the device auto alphas_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); auto betas_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); alphas_device.Write(queue_, batch_count, alphas); betas_device.Write(queue_, batch_count, betas); // Converts the offset to integers auto a_offsets_int = std::vector(batch_count); auto b_offsets_int = std::vector(batch_count); auto c_offsets_int = std::vector(batch_count); for (auto batch = size_t{ 0 }; batch < batch_count; ++batch) { a_offsets_int[batch] = static_cast(a_offsets[batch]); b_offsets_int[batch] = static_cast(b_offsets[batch]); c_offsets_int[batch] = static_cast(c_offsets[batch]); } // Selects which version of the batched GEMM to run if (do_gemm_direct) { // single generic kernel BatchedGemmDirect(m, n, k, alphas_device, a_buffer, a_offsets_int, a_ld, b_buffer, b_offsets_int, b_ld, betas_device, c_buffer, c_offsets_int, c_ld, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, batch_count); } else { // pre/post-processing plus a very fast kernel BatchedGemmIndirect(m, n, k, alphas_device, a_buffer, a_offsets_int, a_ld, b_buffer, b_offsets_int, b_ld, betas_device, c_buffer, c_offsets_int, c_ld, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, a_one, a_two, b_one, b_two, c_one, c_two, batch_count); } } // ================================================================================================= // The indirect version of batched GEMM. This uses the faster but non-general kernel. It has specific // requirements, but several pre and post-processing kernels take care of those. However, the // overhead of these extra kernels might not be ideal for certain devices/arguments. template void XgemmBatched::BatchedGemmIndirect(const size_t m, const size_t n, const size_t k, const Buffer &alphas, const Buffer &a_buffer, const std::vector &a_offsets, const size_t a_ld, const Buffer &b_buffer, const std::vector &b_offsets, const size_t b_ld, const Buffer &betas, const Buffer &c_buffer, const std::vector &c_offsets, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t a_one, const size_t a_two, const size_t b_one, const size_t b_two, const size_t c_one, const size_t c_two, const size_t batch_count) { // Calculates the ceiled versions of m, n, and k const auto m_ceiled = Ceil(Ceil(m, db_["MWG"]), db_["VWM"]); const auto n_ceiled = Ceil(Ceil(n, db_["NWG"]), db_["VWN"]); const auto k_ceiled = Ceil(Ceil(k, db_["KWG"]), db_["VWM"]); // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account // whether the matrices need to be rotated or not for the kernel. 
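// As an illustration of the ceiling above: Ceil(x, y) rounds x up to the next multiple of y, so
// with example values m = 100, MWG = 64 and VWM = 4, m_ceiled = Ceil(Ceil(100, 64), 4) =
// Ceil(128, 4) = 128. The padded temporary copies below are sized to these multiples.
// (Example numbers only; the real values come from the tuning database db_.)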
size_t a_one_i, a_two_i, b_one_i, b_two_i, c_one_i, c_two_i; Xgemm::CalculateInternalDimensions(m, n, k, db_["MWG"], db_["NWG"], db_["KWG"], a_one_i, a_two_i, b_one_i, b_two_i, c_one_i, c_two_i, db_["GEMMK"]); // Sets the "internal" offsets, i.e. the perfect offsets auto a_offsets_i = std::vector(batch_count); auto b_offsets_i = std::vector(batch_count); auto c_offsets_i = std::vector(batch_count); for (auto batch = size_t{0}; batch < batch_count; ++batch) { a_offsets_i[batch] = static_cast(batch * a_one_i * a_two_i); b_offsets_i[batch] = static_cast(batch * b_one_i * b_two_i); c_offsets_i[batch] = static_cast(batch * c_one_i * c_two_i); } // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offsets == a_offsets_i && !a_do_transpose && !a_conjugate; auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offsets == b_offsets_i && !b_do_transpose && !b_conjugate; auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offsets == c_offsets_i && !c_do_transpose; // Creates the temporary matrices const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, batch_count * a_one_i * a_two_i); const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, batch_count * b_one_i * b_two_i); const auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, batch_count * c_one_i * c_two_i); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. 
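// For example, a matrix A that is already stored non-transposed and non-conjugated, with
// dimensions equal to the ceiled internal ones, a_ld == a_one, and batches laid out back-to-back
// (offset batch * a_one_i * a_two_i) satisfies the a_no_temp condition above, so the
// pad/copy/transpose pass is skipped and the kernel reads a_buffer directly (illustrative case).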
if (!a_no_temp) { auto a_offsets_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); auto a_offsets_i_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); a_offsets_device.Write(queue_, batch_count, a_offsets); a_offsets_i_device.Write(queue_, batch_count, a_offsets_i); auto eventProcessA = Event(); PadCopyTransposeMatrixBatched(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offsets_device, a_buffer, a_one_i, a_two_i, a_one_i, a_offsets_i_device, a_temp, program_, true, a_do_transpose, a_conjugate, batch_count); eventWaitList.push_back(eventProcessA); } // As above, but now for matrix B if (!b_no_temp) { auto b_offsets_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); auto b_offsets_i_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); b_offsets_device.Write(queue_, batch_count, b_offsets); b_offsets_i_device.Write(queue_, batch_count, b_offsets_i); auto eventProcessB = Event(); PadCopyTransposeMatrixBatched(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offsets_device, b_buffer, b_one_i, b_two_i, b_one_i, b_offsets_i_device, b_temp, program_, true, b_do_transpose, b_conjugate, batch_count); eventWaitList.push_back(eventProcessB); } // As above, but now for matrix C auto c_offsets_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); auto c_offsets_i_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); if (!c_no_temp) { c_offsets_device.Write(queue_, batch_count, c_offsets); c_offsets_i_device.Write(queue_, batch_count, c_offsets_i); auto eventProcessC = Event(); PadCopyTransposeMatrixBatched(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offsets_device, c_buffer, c_one_i, c_two_i, c_one_i, c_offsets_i_device, c_temp, program_, true, c_do_transpose, false, batch_count); eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary auto kernel = Kernel(program_, "XgemmBatched"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m_ceiled)); kernel.SetArgument(1, static_cast(n_ceiled)); kernel.SetArgument(2, static_cast(k_ceiled)); kernel.SetArgument(3, alphas()); kernel.SetArgument(4, betas()); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, static_cast(a_one_i)); kernel.SetArgument(7, static_cast(a_two_i)); kernel.SetArgument(8, b_temp()); kernel.SetArgument(9, static_cast(b_one_i)); kernel.SetArgument(10, static_cast(b_two_i)); kernel.SetArgument(11, c_temp()); kernel.SetArgument(12, static_cast(c_one_i)); kernel.SetArgument(13, static_cast(c_two_i)); // Computes the global and local thread sizes const auto global = std::vector{ (c_one_i * db_["MDIMC"]) / db_["MWG"], (c_two_i * db_["NDIMC"]) / db_["NWG"], batch_count }; const auto local = std::vector{db_["MDIMC"], db_["NDIMC"], 1}; // Launches the kernel auto eventKernel = Event(); auto eventPointer = (!c_no_temp) ? 
eventKernel.pointer() : event_; RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); // Runs the post-processing kernel if needed if (!c_no_temp) { eventWaitList.push_back(eventKernel); PadCopyTransposeMatrixBatched(queue_, device_, db_, event_, eventWaitList, c_one_i, c_two_i, c_one_i, c_offsets_i_device, c_temp, c_one, c_two, c_ld, c_offsets_device, c_buffer, program_, false, c_do_transpose, false, batch_count); } } // ================================================================================================= // The direct version of batched GEMM, requiring just one kernel, no pre or post-processing kernels. template void XgemmBatched::BatchedGemmDirect(const size_t m, const size_t n, const size_t k, const Buffer &alphas, const Buffer &a_buffer, const std::vector &a_offsets, const size_t a_ld, const Buffer &b_buffer, const std::vector &b_offsets, const size_t b_ld, const Buffer &betas, const Buffer &c_buffer, const std::vector &c_offsets, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t batch_count) { // Uploads the offsets to the device auto a_offsets_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); auto b_offsets_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); auto c_offsets_device = Buffer(context_, BufferAccess::kReadWrite, batch_count); a_offsets_device.Write(queue_, batch_count, a_offsets); b_offsets_device.Write(queue_, batch_count, b_offsets); c_offsets_device.Write(queue_, batch_count, c_offsets); // Retrieves the proper XgemmDirect kernel from the compiled binary const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectBatchedTT" : "XgemmDirectBatchedTN") : (b_do_transpose ? 
"XgemmDirectBatchedNT" : "XgemmDirectBatchedNN"); auto kernel = Kernel(program_, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m)); kernel.SetArgument(1, static_cast(n)); kernel.SetArgument(2, static_cast(k)); kernel.SetArgument(3, alphas()); kernel.SetArgument(4, betas()); kernel.SetArgument(5, a_buffer()); kernel.SetArgument(6, a_offsets_device()); kernel.SetArgument(7, static_cast(a_ld)); kernel.SetArgument(8, b_buffer()); kernel.SetArgument(9, b_offsets_device()); kernel.SetArgument(10, static_cast(b_ld)); kernel.SetArgument(11, c_buffer()); kernel.SetArgument(12, c_offsets_device()); kernel.SetArgument(13, static_cast(c_ld)); kernel.SetArgument(14, static_cast(c_do_transpose)); kernel.SetArgument(15, static_cast(a_conjugate)); kernel.SetArgument(16, static_cast(b_conjugate)); // Computes the global and local thread sizes const auto m_ceiled = Ceil(m, db_["WGD"]); const auto n_ceiled = Ceil(n, db_["WGD"]); const auto global = std::vector{ (m_ceiled * db_["MDIMCD"]) / db_["WGD"], (n_ceiled * db_["NDIMCD"]) / db_["WGD"], batch_count }; const auto local = std::vector{db_["MDIMCD"], db_["NDIMCD"], 1}; // Launches the kernel RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= // Compiles the templated class template class XgemmBatched; template class XgemmBatched; template class XgemmBatched; template class XgemmBatched; template class XgemmBatched; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/levelx/xgemmbatched.hpp000066400000000000000000000072641463263031500221200ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the XgemmBatched routine. This is a non-blas batched version of GEMM. 
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XGEMMBATCHED_H_ #define CLBLAST_ROUTINES_XGEMMBATCHED_H_ #include #include "routine.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class XgemmBatched: public Routine { public: // Constructor XgemmBatched(Queue &queue, EventPointer event, const std::string &name = "GEMMBATCHED"); // Templated-precision implementation of the routine void DoGemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const std::vector &alphas, const Buffer & a_buffer, const std::vector &a_offsets, const size_t a_ld, const Buffer & b_buffer, const std::vector &b_offsets, const size_t b_ld, const std::vector &betas, const Buffer & c_buffer, const std::vector &c_offsets, const size_t c_ld, const size_t batch_count); // Indirect version of batched GEMM (with pre and post-processing kernels) void BatchedGemmIndirect(const size_t m, const size_t n, const size_t k, const Buffer &alphas, const Buffer &a_buffer, const std::vector &a_offsets, const size_t a_ld, const Buffer &b_buffer, const std::vector &b_offsets, const size_t b_ld, const Buffer &betas, const Buffer &c_buffer, const std::vector &c_offsets, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t a_one, const size_t a_two, const size_t b_one, const size_t b_two, const size_t c_one, const size_t c_two, const size_t batch_count); // Direct version of batched GEMM (no pre and post-processing kernels) void BatchedGemmDirect(const size_t m, const size_t n, const size_t k, const Buffer &alphas, const Buffer &a_buffer, const std::vector &a_offsets, const size_t a_ld, const Buffer &b_buffer, const std::vector &b_offsets, const size_t b_ld, const Buffer &betas, const Buffer &c_buffer, const std::vector &c_offsets, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t batch_count); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XGEMMBATCHED_H_ #endif CLBlast-1.6.3/src/routines/levelx/xgemmstridedbatched.cpp000066400000000000000000000363261463263031500234730ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the XgemmStridedBatched class (see the header for information about the class). 
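//
// The strided variant replaces the per-batch offset vectors of XgemmBatched with a single offset
// plus a fixed stride, so batch `b` conceptually addresses its matrices as (illustrative sketch):
//
//   a_batch_offset = a_offset + b * a_stride;  // likewise b_offset/b_stride and c_offset/c_stride
//
// This keeps the host-side argument handling constant-size in the batch count.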
// // ================================================================================================= #include "routines/levelx/xgemmstridedbatched.hpp" #include "routines/level3/xgemm.hpp" #include #include namespace clblast { // ================================================================================================= // Constructor: forwards to base class constructor template XgemmStridedBatched::XgemmStridedBatched(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"}, PrecisionValue(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" #include "../../kernels/level3/transpose_fast.opencl" #include "../../kernels/level3/transpose_pad.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include "../../kernels/level3/xgemm_direct_part3.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_batched.opencl" #include "../../kernels/level3/xgemm_direct_batched.opencl" }) { } // ================================================================================================= // The main routine template void XgemmStridedBatched::DoGemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count) { // Tests for a valid batch count if (batch_count < 1) { throw BLASError(StatusCode::kInvalidBatchCount); } // Makes sure the strides are valid if (c_stride == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Two methods to choose from, select which one to run const auto do_gemm_direct = Xgemm::UseDirectKernel(m, n, k, db_["XGEMM_MIN_INDIRECT_SIZE"]); const auto gemm_kernel_id = (do_gemm_direct) ? 
0 : db_["GEMMK"]; // Computes the transpose/conjugate options and sets the a/b/c sizes based on that bool a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate; size_t a_one, a_two, b_one, b_two, c_one, c_two; Xgemm::ProcessArguments(layout, a_transpose, b_transpose, m, n, k, a_one, a_two, b_one, b_two, c_one, c_two, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, gemm_kernel_id); // Tests the matrices for validity TestStridedBatchedMatrixA(a_one, a_two, a_buffer, a_offset, a_stride, batch_count, a_ld); TestStridedBatchedMatrixB(b_one, b_two, b_buffer, b_offset, b_stride, batch_count, b_ld); TestStridedBatchedMatrixC(c_one, c_two, c_buffer, c_offset, c_stride, batch_count, c_ld); // Selects which version of the batched GEMM to run if (do_gemm_direct) { // single generic kernel BatchedGemmDirect(m, n, k, alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, beta, c_buffer, c_offset, c_ld, c_stride, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, batch_count); } else { // pre/post-processing plus a very fast kernel BatchedGemmIndirect(m, n, k, alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, beta, c_buffer, c_offset, c_ld, c_stride, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, a_one, a_two, b_one, b_two, c_one, c_two, batch_count); } } // ================================================================================================= // The indirect version of batched GEMM. This uses the faster but non-general kernel. It has specific // requirements, but several pre and post-processing kernels take care of those. However, the // overhead of these extra kernels might not be ideal for certain devices/arguments. template void XgemmStridedBatched::BatchedGemmIndirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t a_one, const size_t a_two, const size_t b_one, const size_t b_two, const size_t c_one, const size_t c_two, const size_t batch_count) { // Calculates the ceiled versions of m, n, and k const auto m_ceiled = Ceil(Ceil(m, db_["MWG"]), db_["VWM"]); const auto n_ceiled = Ceil(Ceil(n, db_["NWG"]), db_["VWN"]); const auto k_ceiled = Ceil(Ceil(k, db_["KWG"]), db_["VWM"]); // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account // whether the matrices need to be rotated or not for the kernel. size_t a_one_i, a_two_i, b_one_i, b_two_i, c_one_i, c_two_i; Xgemm::CalculateInternalDimensions(m, n, k, db_["MWG"], db_["NWG"], db_["KWG"], a_one_i, a_two_i, b_one_i, b_two_i, c_one_i, c_two_i, db_["GEMMK"]); // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && !a_do_transpose && !a_conjugate; auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && !b_do_transpose && !b_conjugate; auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && !c_do_transpose; // Creates the temporary matrices const auto a_temp = (a_no_temp) ? 
a_buffer : Buffer(context_, batch_count * a_one_i * a_two_i); const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, batch_count * b_one_i * b_two_i); const auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, batch_count * c_one_i * c_two_i); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); PadCopyTransposeMatrixStridedBatched(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_stride, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_one_i * a_two_i, a_temp, program_, true, a_do_transpose, a_conjugate, batch_count); eventWaitList.push_back(eventProcessA); } // As above, but now for matrix B if (!b_no_temp) { auto eventProcessB = Event(); PadCopyTransposeMatrixStridedBatched(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_stride, b_buffer, b_one_i, b_two_i, b_one_i, 0, b_one_i * b_two_i, b_temp, program_, true, b_do_transpose, b_conjugate, batch_count); eventWaitList.push_back(eventProcessB); } // As above, but now for matrix C if (!c_no_temp) { auto eventProcessC = Event(); PadCopyTransposeMatrixStridedBatched(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_stride, c_buffer, c_one_i, c_two_i, c_one_i, 0, c_one_i * c_two_i, c_temp, program_, true, c_do_transpose, false, batch_count); eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary auto kernel = Kernel(program_, "XgemmStridedBatched"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m_ceiled)); kernel.SetArgument(1, static_cast(n_ceiled)); kernel.SetArgument(2, static_cast(k_ceiled)); kernel.SetArgument(3, GetRealArg(alpha)); kernel.SetArgument(4, GetRealArg(beta)); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, static_cast(a_one_i)); kernel.SetArgument(7, static_cast(a_two_i)); kernel.SetArgument(8, b_temp()); kernel.SetArgument(9, static_cast(b_one_i)); kernel.SetArgument(10, static_cast(b_two_i)); kernel.SetArgument(11, c_temp()); kernel.SetArgument(12, static_cast(c_one_i)); kernel.SetArgument(13, static_cast(c_two_i)); // Computes the global and local thread sizes const auto global = std::vector{ (c_one_i * db_["MDIMC"]) / db_["MWG"], (c_two_i * db_["NDIMC"]) / db_["NWG"], batch_count }; const auto local = std::vector{db_["MDIMC"], db_["NDIMC"], 1}; // Launches the kernel auto eventKernel = Event(); auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); // Runs the post-processing kernel if needed if (!c_no_temp) { eventWaitList.push_back(eventKernel); PadCopyTransposeMatrixStridedBatched(queue_, device_, db_, event_, eventWaitList, c_one_i, c_two_i, c_one_i, 0, c_one_i * c_two_i, c_temp, c_one, c_two, c_ld, c_offset, c_stride, c_buffer, program_, false, c_do_transpose, false, batch_count); } } // ================================================================================================= // The direct version of batched GEMM, requiring just one kernel, no pre or post-processing kernels. 
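// The launch grid of this direct version is sized per batch. With example values WGD = 32 and
// MDIMCD = NDIMCD = 8, and m = n = 100: m_ceiled = n_ceiled = Ceil(100, 32) = 128, giving a
// global size of (128 * 8 / 32, 128 * 8 / 32, batch_count) = (32, 32, batch_count) with 8x8x1
// work-groups. (Example numbers only; the real values come from the tuning database db_.)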
template void XgemmStridedBatched::BatchedGemmDirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t batch_count) { // Retrieves the proper XgemmDirect kernel from the compiled binary const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectStridedBatchedTT" : "XgemmDirectStridedBatchedTN") : (b_do_transpose ? "XgemmDirectStridedBatchedNT" : "XgemmDirectStridedBatchedNN"); auto kernel = Kernel(program_, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m)); kernel.SetArgument(1, static_cast(n)); kernel.SetArgument(2, static_cast(k)); kernel.SetArgument(3, GetRealArg(alpha)); kernel.SetArgument(4, GetRealArg(beta)); kernel.SetArgument(5, a_buffer()); kernel.SetArgument(6, static_cast(a_offset)); kernel.SetArgument(7, static_cast(a_ld)); kernel.SetArgument(8, static_cast(a_stride)); kernel.SetArgument(9, b_buffer()); kernel.SetArgument(10, static_cast(b_offset)); kernel.SetArgument(11, static_cast(b_ld)); kernel.SetArgument(12, static_cast(b_stride)); kernel.SetArgument(13, c_buffer()); kernel.SetArgument(14, static_cast(c_offset)); kernel.SetArgument(15, static_cast(c_ld)); kernel.SetArgument(16, static_cast(c_stride)); kernel.SetArgument(17, static_cast(c_do_transpose)); kernel.SetArgument(18, static_cast(a_conjugate)); kernel.SetArgument(19, static_cast(b_conjugate)); // Computes the global and local thread sizes const auto m_ceiled = Ceil(m, db_["WGD"]); const auto n_ceiled = Ceil(n, db_["WGD"]); const auto global = std::vector{ (m_ceiled * db_["MDIMCD"]) / db_["WGD"], (n_ceiled * db_["NDIMCD"]) / db_["WGD"], batch_count }; const auto local = std::vector{db_["MDIMCD"], db_["NDIMCD"], 1}; // Launches the kernel RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= // Compiles the templated class template class XgemmStridedBatched; template class XgemmStridedBatched; template class XgemmStridedBatched; template class XgemmStridedBatched; template class XgemmStridedBatched; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/levelx/xgemmstridedbatched.hpp000066400000000000000000000072351463263031500234750ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the XgemmStridedBatched routine. This is a non-blas batched version of GEMM. 
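//
// A minimal host-side usage sketch for contiguous, column-major batches (illustrative only;
// assumes a_ld == m, b_ld == k, c_ld == m, and the queue, event and buffers set up elsewhere):
//
//   auto routine = XgemmStridedBatched<float>(queue, event.pointer());
//   routine.DoGemmStridedBatched(Layout::kColMajor, Transpose::kNo, Transpose::kNo, m, n, k,
//                                alpha, a_buffer, 0, a_ld, m * k,
//                                b_buffer, 0, b_ld, k * n,
//                                beta, c_buffer, 0, c_ld, m * n,
//                                batch_count);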
// // ================================================================================================= #ifndef CLBLAST_ROUTINES_XGEMMSTRIDEDBATCHED_H_ #define CLBLAST_ROUTINES_XGEMMSTRIDEDBATCHED_H_ #include #include "routine.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class XgemmStridedBatched: public Routine { public: // Constructor XgemmStridedBatched(Queue &queue, EventPointer event, const std::string &name = "GEMMSTRIDEDBATCHED"); // Templated-precision implementation of the routine void DoGemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count); // Indirect version of strided batched GEMM (with pre and post-processing kernels) void BatchedGemmIndirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t a_one, const size_t a_two, const size_t b_one, const size_t b_two, const size_t c_one, const size_t c_two, const size_t batch_count); // Direct version of strided batched GEMM (no pre and post-processing kernels) void BatchedGemmDirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, const size_t batch_count); }; // ================================================================================================= } // namespace clblast // CLBLAST_ROUTINES_XGEMMSTRIDEDBATCHED_H_ #endif CLBlast-1.6.3/src/routines/levelx/xhad.cpp000066400000000000000000000112431463263031500203770ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xhad class (see the header for information about the class). 
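//
// The Hadamard product computed here is element-wise: for each i,
// z[i] = alpha * x[i] * y[i] + beta * z[i]. Host-side reference (illustrative sketch with
// hypothetical raw arrays instead of the OpenCL buffers below):
//
//   for (size_t i = 0; i < n; ++i) {
//     z[z_offset + i * z_inc] = alpha * x[x_offset + i * x_inc] * y[y_offset + i * y_inc] +
//                               beta * z[z_offset + i * z_inc];
//   }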
#include "routines/levelx/xhad.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xhad<T>::Xhad(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xhad.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xhad<T>::DoHad(const size_t n, const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const T beta,
                    const Buffer<T> &z_buffer, const size_t z_offset, const size_t z_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
  TestVectorX(n, x_buffer, x_offset, x_inc);
  TestVectorY(n, y_buffer, y_offset, y_inc);
  TestVectorY(n, z_buffer, z_offset, z_inc); // TODO: Make a TestVectorZ function with error codes

  // Determines whether or not the fast-version can be used
  const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
                                 (y_offset == 0) && (y_inc == 1) &&
                                 (z_offset == 0) && (z_inc == 1) &&
                                 IsMultiple(n, db_["WPT"]*db_["VW"]);
  const auto use_fastest_kernel = use_faster_kernel &&
                                  IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);

  // If possible, run the fast-version of the kernel
  const auto kernel_name = (use_fastest_kernel) ? "XhadFastest" :
                           (use_faster_kernel) ? "XhadFaster" : "Xhad";

  // Retrieves the Xhad kernel from the compiled binary
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments
  if (use_faster_kernel || use_fastest_kernel) {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, GetRealArg(beta));
    kernel.SetArgument(3, x_buffer());
    kernel.SetArgument(4, y_buffer());
    kernel.SetArgument(5, z_buffer());
  }
  else {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, GetRealArg(beta));
    kernel.SetArgument(3, x_buffer());
    kernel.SetArgument(4, static_cast<int>(x_offset));
    kernel.SetArgument(5, static_cast<int>(x_inc));
    kernel.SetArgument(6, y_buffer());
    kernel.SetArgument(7, static_cast<int>(y_offset));
    kernel.SetArgument(8, static_cast<int>(y_inc));
    kernel.SetArgument(9, z_buffer());
    kernel.SetArgument(10, static_cast<int>(z_offset));
    kernel.SetArgument(11, static_cast<int>(z_inc));
  }

  // Launches the kernel
  if (use_fastest_kernel) {
    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else if (use_faster_kernel) {
    auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else {
    const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
}

// =================================================================================================

// Compiles the templated class
template class Xhad<half>;
template class Xhad<float>;
template class Xhad<double>;
template class Xhad<float2>;
template class Xhad<double2>;

// =================================================================================================
} // namespace clblast
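// =================================================================================================
// [Editor's illustrative sketch, not part of the CLBlast sources] A plain host-side reference of
// what the Xhad kernels compute: the element-wise (Hadamard) product z = alpha*x*y + beta*z, with
// the same offset/increment handling as DoHad above. Useful as a mental model or a test oracle;
// the function name ReferenceHad is hypothetical.

#include <cstddef>
#include <cstdio>
#include <vector>

// z[i] = alpha * x[i] * y[i] + beta * z[i], applied through offsets and increments
template <typename T>
void ReferenceHad(std::size_t n, T alpha,
                  const std::vector<T> &x, std::size_t x_offset, std::size_t x_inc,
                  const std::vector<T> &y, std::size_t y_offset, std::size_t y_inc,
                  T beta, std::vector<T> &z, std::size_t z_offset, std::size_t z_inc) {
  for (std::size_t i = 0; i < n; ++i) {
    z[z_offset + i * z_inc] = alpha * x[x_offset + i * x_inc] * y[y_offset + i * y_inc] +
                              beta * z[z_offset + i * z_inc];
  }
}

int main() {
  auto x = std::vector<float>{1, 2, 3};
  auto y = std::vector<float>{4, 5, 6};
  auto z = std::vector<float>{0, 0, 0};
  ReferenceHad(3, 2.0f, x, 0, 1, y, 0, 1, 0.0f, z, 0, 1);
  std::printf("%g %g %g\n", z[0], z[1], z[2]);  // prints: 8 20 36
}
// =================================================================================================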
CLBlast-1.6.3/src/routines/levelx/xhad.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhad routine. The precision is implemented using a template argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XHAD_H_
#define CLBLAST_ROUTINES_XHAD_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xhad: public Routine {
 public:

  // Constructor
  Xhad(Queue &queue, EventPointer event, const std::string &name = "HAD");

  // Templated-precision implementation of the routine
  void DoHad(const size_t n, const T alpha,
             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
             const T beta,
             const Buffer<T> &z_buffer, const size_t z_offset, const size_t z_inc);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XHAD_H_
#endif

CLBlast-1.6.3/src/routines/levelx/xim2col.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xim2col class (see the header for information about the class).
//
// =================================================================================================

#include "routines/levelx/xim2col.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xim2col<T>::Xim2col(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Copy"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/levelx/im2col.opencl"
    }) {
}

// =================================================================================================
"Xim2colKernelFlip" : "Xim2colKernelNormal"; // Makes sure all dimensions are larger than zero if ((channels == 0) || (height == 0) || (width == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Sets the height and width of the 'col' result const auto size_h = height + 2 * pad_h; const auto padding_h = dilation_h * (kernel_h - 1) + 1; const auto col_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1; const auto size_w = width + 2 * pad_w; const auto padding_w = dilation_w * (kernel_w - 1) + 1; const auto col_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1; // Retrieves the kernel from the compiled binary auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(height)); kernel.SetArgument(1, static_cast(width)); kernel.SetArgument(2, static_cast(channels)); kernel.SetArgument(3, static_cast(col_h)); kernel.SetArgument(4, static_cast(col_w)); kernel.SetArgument(5, static_cast(kernel_h)); kernel.SetArgument(6, static_cast(kernel_w)); kernel.SetArgument(7, static_cast(pad_h)); kernel.SetArgument(8, static_cast(pad_w)); kernel.SetArgument(9, static_cast(stride_h)); kernel.SetArgument(10, static_cast(stride_w)); kernel.SetArgument(11, static_cast(dilation_h)); kernel.SetArgument(12, static_cast(dilation_w)); kernel.SetArgument(13, im_buffer()); kernel.SetArgument(14, static_cast(im_offset)); kernel.SetArgument(15, col_buffer()); kernel.SetArgument(16, static_cast(col_offset)); // Launches the kernel const auto w_ceiled = Ceil(col_w, db_["COPY_DIMX"]); const auto h_ceiled = Ceil(col_h, db_["COPY_DIMY"]); const auto global = std::vector{w_ceiled, h_ceiled * channels}; const auto local = std::vector{db_["COPY_DIMX"], db_["COPY_DIMY"]}; RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= // Compiles the templated class template class Xim2col; template class Xim2col; template class Xim2col; template class Xim2col; template class Xim2col; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/routines/levelx/xim2col.hpp000066400000000000000000000034641463263031500210430ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Xim2col routine. The precision is implemented using a template argument. // Uses the tuning parameters from the regular copy kernel. 
CLBlast-1.6.3/src/routines/levelx/xim2col.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xim2col routine. The precision is implemented using a template argument.
// Uses the tuning parameters from the regular copy kernel.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XIM2COL_H_
#define CLBLAST_ROUTINES_XIM2COL_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xim2col: public Routine {
 public:

  // Constructor
  Xim2col(Queue &queue, EventPointer event, const std::string &name = "IM2COL");

  // Templated-precision implementation of the routine
  void DoIm2col(const KernelMode kernel_mode,
                const size_t channels, const size_t height, const size_t width,
                const size_t kernel_h, const size_t kernel_w,
                const size_t pad_h, const size_t pad_w,
                const size_t stride_h, const size_t stride_w,
                const size_t dilation_h, const size_t dilation_w,
                const Buffer<T> &im_buffer, const size_t im_offset,
                const Buffer<T> &col_buffer, const size_t col_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XIM2COL_H_
#endif

CLBlast-1.6.3/src/routines/levelx/xinvert.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common code to perform (partial) matrix inverting. This code is based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
// =================================================================================================
#include "routines/levelx/xinvert.hpp"

#include <string>
#include <vector>
#include <cassert>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level3/level3.opencl"
    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/invert_diagonal_blocks_part1.opencl"
    , // separated in multiple parts to prevent C1091 in MSVC 2013
    #include "../../kernels/level3/invert_diagonal_blocks_part2.opencl"
    }) {
}

// =================================================================================================

// Inverts diagonal square blocks of a matrix
template <typename T>
void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle,
                                            const Diagonal diag,
                                            const size_t n, const size_t block_size,
                                            const Buffer<T> &src, const size_t offset,
                                            const size_t ld_src, Buffer<T> &dest) {

  // Makes sure all dimensions are larger than zero
  if ((block_size == 0) || (n == 0)) {
    throw BLASError(StatusCode::kInvalidDimension);
  }

  // Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
  if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
    throw RuntimeErrorCode(StatusCode::kNotImplemented);
  }

  // Helper variables
  const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
  if (internal_block_size != 16) {
    throw RuntimeErrorCode(StatusCode::kNotImplemented); // e.g. Apple CPU OpenCL with a WGS of 1
  }                                                      // when barriers are present
  const auto num_blocks = CeilDiv(n, block_size);
  const auto num_internal_blocks = CeilDiv(n, internal_block_size);
  const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;

  // This routine only supports block sizes which are a multiple of the internal block size and
  // block sizes up to and including 128
  if ((block_size % internal_block_size != 0) || (block_size > 128)) {
    throw BLASError(StatusCode::kUnknownError);
  }

  // Checks for validity of the source and destination matrices
  TestMatrixA(n, n, src, offset, ld_src);
  TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size);

  // Determines which kernels to run based on the layout (the kernels assume column-major as
  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
  const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  const auto name_postfix = (is_upper) ? "Upper" : "Lower";
"Upper" : "Lower"; // Fills the output buffer with zeros auto event_wait_list = std::vector(); auto fill_matrix_event = Event(); FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), event_wait_list, block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero(), 16); event_wait_list.push_back(fill_matrix_event); // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group auto kernel = Kernel(program_, "InvertDiagonalBlock"); kernel.SetArgument(0, static_cast(n)); kernel.SetArgument(1, src()); kernel.SetArgument(2, static_cast(offset)); kernel.SetArgument(3, static_cast(ld_src)); kernel.SetArgument(4, dest()); kernel.SetArgument(5, static_cast(block_size)); kernel.SetArgument(6, static_cast(unit_diagonal)); kernel.SetArgument(7, static_cast(is_upper)); const auto local_invert = std::vector{internal_block_size}; const auto global_invert = std::vector{num_internal_blocks * internal_block_size}; auto base_kernel_event = Event(); auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer(); RunKernel(kernel, queue_, device_, global_invert, local_invert, base_kernel_event_pointer, event_wait_list); if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); } // Builds up block_size x block_size blocks. For example, internal_block_size=16: // use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads; // then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads; // then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads; for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) { assert(current_size == 16 || current_size == 32 || current_size == 64); // Emulates a 3D grid: NX * (NY * npages) const auto npages = CeilDiv(n, current_size*2); const auto local0 = (current_size <= 32) ? current_size/4 : 16; const auto local = std::vector{local0, 4}; const auto global = std::vector{Ceil(current_size/local[1], local[0]), Ceil(npages*(current_size/16)*local[1], local[1])}; // Part 1 auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); kernel1.SetArgument(0, static_cast(n)); kernel1.SetArgument(1, src()); kernel1.SetArgument(2, static_cast(offset)); kernel1.SetArgument(3, static_cast(ld_src)); kernel1.SetArgument(4, dest()); kernel1.SetArgument(5, static_cast(current_size)); kernel1.SetArgument(6, static_cast(npages)); kernel1.SetArgument(7, static_cast(block_size)); auto kernel1_event = Event(); RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list); event_wait_list.push_back(kernel1_event); // Part 2 const bool is_last_kernel = (current_size * 2 >= block_size); auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); kernel2.SetArgument(0, static_cast(n)); kernel2.SetArgument(1, dest()); kernel2.SetArgument(2, static_cast(current_size)); kernel2.SetArgument(3, static_cast(npages)); kernel2.SetArgument(4, static_cast(block_size)); auto kernel2_event = Event(); auto kernel2_event_pointer = (is_last_kernel) ? 
CLBlast-1.6.3/src/routines/levelx/xinvert.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common code to perform (partial) matrix inverting.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XINVERT_H_
#define CLBLAST_ROUTINES_XINVERT_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

template <typename T>
class Xinvert: public Routine {
 public:

  // Constructor
  Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT");

  // Inverts diagonal square blocks of a matrix
  void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
                                  const size_t n, const size_t block_size,
                                  const Buffer<T> &src, const size_t offset, const size_t ld_src,
                                  Buffer<T> &dest);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XINVERT_H_
#endif
CLBlast-1.6.3/src/routines/levelx/xomatcopy.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xomatcopy class (see the header for information about the class).
//
// =================================================================================================

#include "routines/levelx/xomatcopy.hpp"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>(), {}, {
    #include "../../kernels/level3/level3.opencl"
    #include "../../kernels/level3/copy_fast.opencl"
    #include "../../kernels/level3/copy_pad.opencl"
    #include "../../kernels/level3/transpose_fast.opencl"
    #include "../../kernels/level3/transpose_pad.opencl"
    }) {
}

// =================================================================================================

// The main routine
template <typename T>
void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
                              const size_t m, const size_t n, const T alpha,
                              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                              const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {

  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // Determines whether to transpose the matrix A
  const auto transpose = (a_transpose != Transpose::kNo);

  // In case of complex data-types, the transpose can also become a conjugate transpose
  const auto conjugate = (a_transpose == Transpose::kConjugate);

  // Computes the dimensions of the two matrices
  const auto rotated = (layout == Layout::kRowMajor);
  const auto a_one = (rotated) ? n : m;
  const auto a_two = (rotated) ? m : n;
  const auto b_one = (transpose) ? a_two : a_one;
  const auto b_two = (transpose) ? a_one : a_two;

  // Tests the matrices for validity, first from a perspective of the OpenCL buffers and their
  // sizes, and then from a perspective of parameter values (e.g. m, n). Tests whether the OpenCL
  // buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage space.
  // Also tests that the leading dimensions of:
  //    matrix A cannot be less than N when rotated, or less than M when not-rotated
  //    matrix B cannot be less than M when rotated, or less than N when not-rotated
  TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
  TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);

  auto emptyEventList = std::vector<Event>();
  PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList,
                         a_one, a_two, a_ld, a_offset, a_buffer,
                         b_one, b_two, b_ld, b_offset, b_buffer,
                         alpha, program_, false, transpose, conjugate);
}

// =================================================================================================

// Compiles the templated class
template class Xomatcopy<half>;
template class Xomatcopy<float>;
template class Xomatcopy<double>;
template class Xomatcopy<float2>;
template class Xomatcopy<double2>;

// =================================================================================================
} // namespace clblast
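// =================================================================================================
// [Editor's illustrative sketch, not part of the CLBlast sources] The rotated/transpose logic in
// DoOmatcopy maps the user-visible (m, n) shape onto the stored shapes of A and B. A tiny worked
// example for a row-major, transposed copy (values are illustrative only):

#include <cstddef>
#include <cstdio>

int main() {
  const bool rotated = true;    // layout == Layout::kRowMajor
  const bool transpose = true;  // a_transpose != Transpose::kNo
  const size_t m = 100, n = 200;
  const auto a_one = rotated ? n : m;    // 200: first stored dimension of A
  const auto a_two = rotated ? m : n;    // 100: second stored dimension of A
  const auto b_one = transpose ? a_two : a_one;  // 100: B's stored shape is A's, swapped
  const auto b_two = transpose ? a_one : a_two;  // 200
  std::printf("A is stored as %zu x %zu, B as %zu x %zu\n", a_one, a_two, b_one, b_two);
}
// =================================================================================================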
CLBlast-1.6.3/src/routines/levelx/xomatcopy.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xomatcopy routine. The precision is implemented using a template
// argument.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_
#define CLBLAST_ROUTINES_XOMATCOPY_H_

#include "routine.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xomatcopy: public Routine {
 public:

  // Constructor
  Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");

  // Templated-precision implementation of the routine
  void DoOmatcopy(const Layout layout, const Transpose a_transpose,
                  const size_t m, const size_t n, const T alpha,
                  const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                  const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XOMATCOPY_H_
#endif

CLBlast-1.6.3/src/routines/routines.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the includes of all the routines in CLBlast.
//
// =================================================================================================

#ifndef CLBLAST_ROUTINES_ROUTINES_H_
#define CLBLAST_ROUTINES_ROUTINES_H_

// BLAS level-1 includes
#include "routines/level1/xswap.hpp"
#include "routines/level1/xscal.hpp"
#include "routines/level1/xcopy.hpp"
#include "routines/level1/xaxpy.hpp"
#include "routines/level1/xdot.hpp"
#include "routines/level1/xdotu.hpp"
#include "routines/level1/xdotc.hpp"
#include "routines/level1/xnrm2.hpp"
#include "routines/level1/xasum.hpp"
#include "routines/level1/xsum.hpp" // non-BLAS routine
#include "routines/level1/xamax.hpp"
#include "routines/level1/xamin.hpp" // non-BLAS routine
#include "routines/level1/xmax.hpp" // non-BLAS routine
#include "routines/level1/xmin.hpp" // non-BLAS routine

// BLAS level-2 includes
#include "routines/level2/xgemv.hpp"
#include "routines/level2/xgbmv.hpp"
#include "routines/level2/xhemv.hpp"
#include "routines/level2/xhbmv.hpp"
#include "routines/level2/xhpmv.hpp"
#include "routines/level2/xsymv.hpp"
#include "routines/level2/xsbmv.hpp"
#include "routines/level2/xspmv.hpp"
#include "routines/level2/xtrmv.hpp"
#include "routines/level2/xtbmv.hpp"
#include "routines/level2/xtpmv.hpp"
#include "routines/level2/xtrsv.hpp"
#include "routines/level2/xger.hpp"
#include "routines/level2/xgeru.hpp"
#include "routines/level2/xgerc.hpp"
#include "routines/level2/xher.hpp"
#include "routines/level2/xhpr.hpp"
#include "routines/level2/xher2.hpp"
#include "routines/level2/xhpr2.hpp"
#include "routines/level2/xsyr.hpp"
#include "routines/level2/xspr.hpp"
#include "routines/level2/xsyr2.hpp"
#include "routines/level2/xspr2.hpp"

// BLAS level-3 includes
#include "routines/level3/xgemm.hpp"
#include "routines/level3/xsymm.hpp"
#include "routines/level3/xhemm.hpp"
#include "routines/level3/xsyrk.hpp"
#include "routines/level3/xherk.hpp" #include "routines/level3/xsyr2k.hpp" #include "routines/level3/xher2k.hpp" #include "routines/level3/xtrmm.hpp" #include "routines/level3/xtrsm.hpp" // Level-x includes (non-BLAS) #include "routines/levelx/xhad.hpp" #include "routines/levelx/xomatcopy.hpp" #include "routines/levelx/xim2col.hpp" #include "routines/levelx/xcol2im.hpp" #include "routines/levelx/xconvgemm.hpp" #include "routines/levelx/xaxpybatched.hpp" #include "routines/levelx/xgemmbatched.hpp" #include "routines/levelx/xgemmstridedbatched.hpp" // CLBLAST_ROUTINES_ROUTINES_H_ #endif CLBlast-1.6.3/src/tuning/000077500000000000000000000000001463263031500151035ustar00rootroot00000000000000CLBlast-1.6.3/src/tuning/configurations.cpp000066400000000000000000000154261463263031500206510ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune). // This is only used for the optional tuner binaries and not part of the core of CLBlast. // // ================================================================================================= #include #include #include "tuning/configurations.hpp" namespace clblast { // ================================================================================================= // Finds all configurations. It also applies the user-defined constraints within. std::vector SetConfigurations(const Device& device, const std::vector parameters, const std::vector& local_size_base, const TransformVector& mul_local_config, const TransformVector& div_local_config, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info) { const auto local_mem_max = device.LocalMemSize(); const auto max_work_item_sizes = device.MaxWorkItemSizes(); const auto max_work_group_size = device.MaxWorkGroupSize(); auto config = Configuration(); auto configurations = std::vector(); PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config, 0, config, configurations, local_mem_max, constraints, local_mem_size_info, max_work_item_sizes, max_work_group_size); return configurations; } // Iterates recursively over all permutations of the user-defined parameters void PopulateConfigurations(const std::vector ¶meters, const std::vector local_size_base, const TransformVector& mul_local_config, const TransformVector& div_local_config, const size_t index, const Configuration &config, std::vector &configuration, const size_t local_mem_max, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info, const std::vector& max_work_item_sizes, const size_t max_work_group_size) { // End of the chain: all parameters are considered, store the resulting configuration if it is a // valid one according to the constraints if (index == parameters.size()) { if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info, local_size_base, mul_local_config, div_local_config, max_work_item_sizes, max_work_group_size)) { configuration.push_back(config); } return; } // This loop iterates over all values of the current parameter and calls this function recursively Parameter parameter = parameters[index]; for (auto 
&value: parameter.second) { auto config_copy = config; config_copy[parameter.first] = value; PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config, index+1, config_copy, configuration, local_mem_max, constraints, local_mem_size_info, max_work_item_sizes, max_work_group_size); } } // Loops over all user-defined constraints to check whether or not the configuration is valid bool ValidConfiguration(const Configuration &config, const size_t local_mem_max, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info, const std::vector local_size_base, const TransformVector& mul_local_config, const TransformVector& div_local_config, const std::vector& max_work_item_sizes, const size_t max_work_group_size) { // Iterates over all constraints for (auto &constraint: constraints) { // Finds the values of the parameters auto values = std::vector(constraint.parameters.size()); for (auto i=size_t{0}; i(local_mem_size_info.parameters.size()); for (auto i=size_t{0}; i local_mem_max) { return false; } // Checks the local thread size (both per dimension and in total) const auto local = SetThreadConfiguration(config, local_size_base, mul_local_config, div_local_config); for (auto i=size_t{0}; i max_work_item_sizes[i]) { return false; } } auto local_size = size_t{1}; for (auto &item: local) { local_size *= item; } if (local_size > max_work_group_size) { return false; } // Everything was OK: this configuration is valid return true; } // Multiplies and/or dividers a thread configuration (local/global) std::vector SetThreadConfiguration(const Configuration& config, const std::vector base, const TransformVector& mul_config, const TransformVector& div_config) { auto result = base; for (const auto &multipliers: mul_config) { for (auto i = size_t{0}; i < multipliers.size(); ++i) { result[i] *= config.at(multipliers[i]); } } for (const auto ÷rs: div_config) { for (auto i = size_t{0}; i < dividers.size(); ++i) { result[i] /= config.at(dividers[i]); } } return result; } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/tuning/configurations.hpp000066400000000000000000000121011463263031500206410ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune). // This is only used for the optional tuner binaries and not part of the core of CLBlast. // // ================================================================================================= #ifndef CLBLAST_TUNING_CONFIGURATIONS_H_ #define CLBLAST_TUNING_CONFIGURATIONS_H_ #include #include #include #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= using Configuration = std::map; using Parameter = std::pair>; using TransformVector = std::vector>; // Helper structure holding a constraint on parameters. This constraint consists of a constraint // function object and a vector of parameter names represented as strings. 
CLBlast-1.6.3/src/tuning/configurations.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune).
// This is only used for the optional tuner binaries and not part of the core of CLBlast.
//
// =================================================================================================

#ifndef CLBLAST_TUNING_CONFIGURATIONS_H_
#define CLBLAST_TUNING_CONFIGURATIONS_H_

#include <string>
#include <vector>
#include <map>

#include "utilities/utilities.hpp"

namespace clblast {
// =================================================================================================

using Configuration = std::map<std::string, size_t>;
using Parameter = std::pair<std::string, std::vector<size_t>>;
using TransformVector = std::vector<std::vector<std::string>>;

// Helper structure holding a constraint on parameters. This constraint consists of a constraint
// function object and a vector of parameter names represented as strings.
using ConstraintFunction = std::function<bool(std::vector<size_t>)>;
struct Constraint {
  ConstraintFunction valid_if;
  std::vector<std::string> parameters;
};
using Constraints = std::vector<Constraint>;

// As above, but for local memory size
using LocalMemSizeFunction = std::function<size_t(std::vector<size_t>)>;
struct LocalMemSizeInfo {
  LocalMemSizeFunction local_mem_size;
  std::vector<std::string> parameters;
};

// =================================================================================================

// Initializes an empty configuration (vector of name/value pairs) and kicks-off the recursive
// function to find all configurations. It also applies the user-defined constraints within.
std::vector<Configuration> SetConfigurations(const Device& device,
                                             const std::vector<Parameter> parameters,
                                             const std::vector<size_t>& local_size_base,
                                             const TransformVector& mul_local_config,
                                             const TransformVector& div_local_config,
                                             const Constraints& constraints,
                                             const LocalMemSizeInfo& local_mem_size_info);

// Iterates recursively over all permutations of the user-defined parameters. This code creates
// multiple chains, in which each chain selects a unique combination of values for all parameters.
// At the end of each chain (when all parameters are considered), the function stores the result
// into the configuration list.
void PopulateConfigurations(const std::vector<Parameter> &parameters,
                            const std::vector<size_t> local_size_base,
                            const TransformVector& mul_local_config,
                            const TransformVector& div_local_config,
                            const size_t index, const Configuration &config,
                            std::vector<Configuration> &configuration,
                            const size_t local_mem_max,
                            const Constraints& constraints,
                            const LocalMemSizeInfo& local_mem_size_info,
                            const std::vector<size_t>& max_work_item_sizes,
                            const size_t max_work_group_size);

// Loops over all user-defined constraints to check whether or not the configuration is valid.
// Assumes initially all configurations are valid, then returns false if one of the constraints has
// not been met. Constraints consist of a user-defined function and a list of parameter names, which
// are replaced by parameter values in this function.
bool ValidConfiguration(const Configuration &config,
                        const size_t local_mem_max,
                        const Constraints& constraints,
                        const LocalMemSizeInfo& local_mem_size_info,
                        const std::vector<size_t> local_size_base,
                        const TransformVector& mul_local_config,
                        const TransformVector& div_local_config,
                        const std::vector<size_t>& max_work_item_sizes,
                        const size_t max_work_group_size);

// Processes multipliers and dividers to obtain the final thread configuration
std::vector<size_t> SetThreadConfiguration(const Configuration& config,
                                           const std::vector<size_t> base,
                                           const TransformVector& mul_config,
                                           const TransformVector& div_config);

// =================================================================================================
} // namespace clblast

// CLBLAST_TUNING_CONFIGURATIONS_H_
#endif
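// =================================================================================================
// [Editor's illustrative sketch, not part of the CLBlast sources] How a Constraint from the header
// above is meant to be used: the tuner looks up the named parameters in a Configuration and feeds
// their values to valid_if. The parameter names "WGS" and "N" below are made-up examples.

#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Mirrors the structures declared in the header above
using ConstraintFunction = std::function<bool(std::vector<size_t>)>;
struct Constraint {
  ConstraintFunction valid_if;
  std::vector<std::string> parameters;
};

int main() {
  // Accept only configurations where the first named parameter divides the second
  const Constraint c{[](std::vector<size_t> v) { return v[1] % v[0] == 0; }, {"WGS", "N"}};
  const std::vector<size_t> values = {16, 256};  // as looked up from a Configuration by name
  return c.valid_if(values) ? 0 : 1;             // 256 % 16 == 0, so this configuration is valid
}
// =================================================================================================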
CLBlast-1.6.3/src/tuning/kernels/copy_fast.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the copy OpenCL kernels.
//
// =================================================================================================

#include "tuning/kernels/copy_fast.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
    switch(clblast::GetPrecision(command_line_args)) {
      case clblast::Precision::kHalf:
        clblast::Tuner<half>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<half>,
                             clblast::CopyTestValidArguments<half>, clblast::CopySetConstraints,
                             clblast::CopyComputeLocalMemSize<half>, clblast::CopySetArguments<half>); break;
      case clblast::Precision::kSingle:
        clblast::Tuner<float>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<float>,
                              clblast::CopyTestValidArguments<float>, clblast::CopySetConstraints,
                              clblast::CopyComputeLocalMemSize<float>, clblast::CopySetArguments<float>); break;
      case clblast::Precision::kDouble:
        clblast::Tuner<double>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<double>,
                               clblast::CopyTestValidArguments<double>, clblast::CopySetConstraints,
                               clblast::CopyComputeLocalMemSize<double>, clblast::CopySetArguments<double>); break;
      case clblast::Precision::kComplexSingle:
        clblast::Tuner<float2>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<float2>,
                               clblast::CopyTestValidArguments<float2>, clblast::CopySetConstraints,
                               clblast::CopyComputeLocalMemSize<float2>, clblast::CopySetArguments<float2>); break;
      case clblast::Precision::kComplexDouble:
        clblast::Tuner<double2>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<double2>,
                                clblast::CopyTestValidArguments<double2>, clblast::CopySetConstraints,
                                clblast::CopyComputeLocalMemSize<double2>, clblast::CopySetArguments<double2>); break;
    }
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/copy_fast.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the copy OpenCL kernels.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults CopyGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgM, kArgN, kArgAlpha};
  settings.default_m = 1024;
  settings.default_n = 1024;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings CopyGetTunerSettings(const int, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "copy";
  settings.kernel_name = "CopyMatrixFast";
  settings.sources =
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/copy_fast.opencl"
  ;

  // Buffer sizes
  settings.size_a = args.m * args.n;
  settings.size_b = args.m * args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {2, 3};
  settings.outputs = {3};

  // Sets the base thread configuration
  settings.global_size = {args.m, args.n};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1};
  settings.local_size_ref = {8, 8};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"COPY_DIMX", "COPY_DIMY"}};
  settings.div_global = {{"COPY_VW", "COPY_WPT"}};

  // Sets the tuning parameters and their possible values
  settings.parameters = {
    {"COPY_DIMX", {8, 16, 32}},
    {"COPY_DIMY", {8, 16, 32}},
    {"COPY_WPT", {1, 2, 4, 8}},
    {"COPY_VW", {1, 2, 4, 8}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision);
  settings.performance_unit = "GB/s";

  return settings;
}

// Tests for valid arguments
template <typename T>
void CopyTestValidArguments(const int, const Arguments<T> &) { }
std::vector<Constraint> CopySetConstraints(const int) { return {}; }
template <typename T>
LocalMemSizeInfo CopyComputeLocalMemSize(const int) {
  return { [] (std::vector<size_t>) -> size_t { return 0; }, {} };
}

// Sets the kernel's arguments
template <typename T>
void CopySetArguments(const int, Kernel &kernel, const Arguments<T> &args,
                      std::vector<Buffer<T>>& buffers) {
  kernel.SetArgument(0, static_cast<int>(args.m));
  kernel.SetArgument(1, buffers[2]()); // 2 == A matrix
  kernel.SetArgument(2, buffers[3]()); // 3 == B matrix
  kernel.SetArgument(3, GetRealArg(args.alpha));
}

// =================================================================================================
} // namespace clblast
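// =================================================================================================
// [Editor's illustrative sketch, not part of the CLBlast sources] The mul_local / div_global
// transforms above turn the base thread configuration into the launch configuration for one
// candidate parameter set. A worked example with assumed values COPY_DIMX=16, COPY_DIMY=8,
// COPY_VW=4, COPY_WPT=2:

#include <cstddef>
#include <cstdio>

int main() {
  size_t global[2] = {1024, 1024};  // base global size is (m, n)
  size_t local[2] = {1, 1};         // base local size before transforms
  const size_t copy_dimx = 16, copy_dimy = 8, copy_vw = 4, copy_wpt = 2;  // assumed parameters
  local[0] *= copy_dimx; local[1] *= copy_dimy;  // mul_local  = {{COPY_DIMX, COPY_DIMY}}
  global[0] /= copy_vw;  global[1] /= copy_wpt;  // div_global = {{COPY_VW, COPY_WPT}}
  std::printf("global %zu x %zu, local %zu x %zu\n",
              global[0], global[1], local[0], local[1]);  // global 256 x 512, local 16 x 8
}
// =================================================================================================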
CLBlast-1.6.3/src/tuning/kernels/copy_pad.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the pad OpenCL kernels.
//
// =================================================================================================

#include "tuning/kernels/copy_pad.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
    switch(clblast::GetPrecision(command_line_args)) {
      case clblast::Precision::kHalf:
        clblast::Tuner<half>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<half>,
                             clblast::PadTestValidArguments<half>, clblast::PadSetConstraints,
                             clblast::PadComputeLocalMemSize<half>, clblast::PadSetArguments<half>); break;
      case clblast::Precision::kSingle:
        clblast::Tuner<float>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<float>,
                              clblast::PadTestValidArguments<float>, clblast::PadSetConstraints,
                              clblast::PadComputeLocalMemSize<float>, clblast::PadSetArguments<float>); break;
      case clblast::Precision::kDouble:
        clblast::Tuner<double>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<double>,
                               clblast::PadTestValidArguments<double>, clblast::PadSetConstraints,
                               clblast::PadComputeLocalMemSize<double>, clblast::PadSetArguments<double>); break;
      case clblast::Precision::kComplexSingle:
        clblast::Tuner<float2>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<float2>,
                               clblast::PadTestValidArguments<float2>, clblast::PadSetConstraints,
                               clblast::PadComputeLocalMemSize<float2>, clblast::PadSetArguments<float2>); break;
      case clblast::Precision::kComplexDouble:
        clblast::Tuner<double2>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<double2>,
                                clblast::PadTestValidArguments<double2>, clblast::PadSetConstraints,
                                clblast::PadComputeLocalMemSize<double2>, clblast::PadSetArguments<double2>); break;
    }
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/copy_pad.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the pad OpenCL kernels.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults PadGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgM, kArgN, kArgAlpha};
  settings.default_m = 1024;
  settings.default_n = 1024;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings PadGetTunerSettings(const int, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "pad";
  settings.kernel_name = "CopyPadMatrix";
  settings.sources =
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/copy_pad.opencl"
  ;

  // Buffer sizes
  settings.size_a = args.m * args.n;
  settings.size_b = args.m * args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {2, 3};
  settings.outputs = {3};

  // Sets the base thread configuration
  settings.global_size = {args.m, args.n};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1};
  settings.local_size_ref = {8, 8};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"PAD_DIMX", "PAD_DIMY"}};
  settings.div_global = {{"PAD_WPTX", "PAD_WPTY"}};

  // Sets the tuning parameters and their possible values
  settings.parameters = {
    {"PAD_DIMX", {8, 16, 32}},
    {"PAD_DIMY", {8, 16, 32}},
    {"PAD_WPTX", {1, 2, 4}},
    {"PAD_WPTY", {1, 2, 4}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision);
  settings.performance_unit = "GB/s";

  return settings;
}

// Tests for valid arguments
template <typename T>
void PadTestValidArguments(const int, const Arguments<T> &) { }
std::vector<Constraint> PadSetConstraints(const int) { return {}; }
template <typename T>
LocalMemSizeInfo PadComputeLocalMemSize(const int) {
  return { [] (std::vector<size_t>) -> size_t { return 0; }, {} };
}

// Sets the kernel's arguments
template <typename T>
void PadSetArguments(const int, Kernel &kernel, const Arguments<T> &args,
                     std::vector<Buffer<T>>& buffers) {
  kernel.SetArgument(0, static_cast<int>(args.m));
  kernel.SetArgument(1, static_cast<int>(args.n));
  kernel.SetArgument(2, static_cast<int>(args.m));
  kernel.SetArgument(3, 0);
  kernel.SetArgument(4, buffers[2]()); // 2 == A matrix
  kernel.SetArgument(5, static_cast<int>(args.m));
  kernel.SetArgument(6, static_cast<int>(args.n));
  kernel.SetArgument(7, static_cast<int>(args.m));
  kernel.SetArgument(8, 0);
  kernel.SetArgument(9, buffers[3]()); // 3 == B matrix
  kernel.SetArgument(10, GetRealArg(args.alpha));
  kernel.SetArgument(11, 0);
}

// =================================================================================================
} // namespace clblast
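// =================================================================================================
// [Editor's illustrative sketch, not part of the CLBlast sources] A plain host-side picture of
// what a pad-copy does conceptually: copy an m1 x n1 source into the top-left of a possibly
// larger m2 x n2 destination (column-major), scaling by alpha and zero-filling the rest. This is
// only an assumed mental model of the CopyPadMatrix kernel tuned above, not its actual code.

#include <cstddef>
#include <vector>

template <typename T>
void ReferencePadCopy(std::size_t m1, std::size_t n1, std::size_t ld1, const std::vector<T> &src,
                      std::size_t m2, std::size_t n2, std::size_t ld2, std::vector<T> &dest,
                      T alpha) {
  for (std::size_t j = 0; j < n2; ++j) {
    for (std::size_t i = 0; i < m2; ++i) {
      // Inside the source: scaled copy; outside: zero padding
      dest[j * ld2 + i] = (i < m1 && j < n1) ? alpha * src[j * ld1 + i] : T{0};
    }
  }
}

int main() {
  auto src = std::vector<float>(2 * 2, 1.0f);
  auto dest = std::vector<float>(4 * 4, -1.0f);
  ReferencePadCopy<float>(2, 2, 2, src, 4, 4, 4, dest, 3.0f);  // 2x2 block of 3s, zeros elsewhere
}
// =================================================================================================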
CLBlast-1.6.3/src/tuning/kernels/invert.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the invert OpenCL kernels.
//
// =================================================================================================

#include "tuning/kernels/invert.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
    switch(clblast::GetPrecision(command_line_args)) {
      case clblast::Precision::kHalf:
        clblast::Tuner<half>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<half>,
                             clblast::InvertTestValidArguments<half>, clblast::InvertSetConstraints,
                             clblast::InvertComputeLocalMemSize<half>, clblast::InvertSetArguments<half>); break;
      case clblast::Precision::kSingle:
        clblast::Tuner<float>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<float>,
                              clblast::InvertTestValidArguments<float>, clblast::InvertSetConstraints,
                              clblast::InvertComputeLocalMemSize<float>, clblast::InvertSetArguments<float>); break;
      case clblast::Precision::kDouble:
        clblast::Tuner<double>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<double>,
                               clblast::InvertTestValidArguments<double>, clblast::InvertSetConstraints,
                               clblast::InvertComputeLocalMemSize<double>, clblast::InvertSetArguments<double>); break;
      case clblast::Precision::kComplexSingle:
        clblast::Tuner<float2>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<float2>,
                               clblast::InvertTestValidArguments<float2>, clblast::InvertSetConstraints,
                               clblast::InvertComputeLocalMemSize<float2>, clblast::InvertSetArguments<float2>); break;
      case clblast::Precision::kComplexDouble:
        clblast::Tuner<double2>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<double2>,
                                clblast::InvertTestValidArguments<double2>, clblast::InvertSetConstraints,
                                clblast::InvertComputeLocalMemSize<double2>, clblast::InvertSetArguments<double2>); break;
    }
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/invert.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the invert OpenCL kernels.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults InvertGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgN, kArgM, kArgK};
  settings.default_n = 128; // dimension of input matrix 'n'
  settings.default_m = 64; // block size
  settings.default_k = 16; // current size
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings InvertGetTunerSettings(const int, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "invert";
  settings.kernel_name = "TripleMatMul16Part1Lower";
  settings.sources = "#define ROUTINE_INVERT"
#include "../src/kernels/level3/invert_diagonal_blocks_part1.opencl"
#include "../src/kernels/level3/invert_diagonal_blocks_part2.opencl"
  ;

  // Buffer sizes
  settings.size_a = args.n * args.n + args.a_offset;
  settings.size_b = Ceil(args.n, args.m) * args.m; // Ceil(n, block_size) * block_size

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {2, 3};
  settings.outputs = {3};

  // Sets the base thread configuration
  const auto num_pages = CeilDiv(args.n, args.k * 2); // CeilDiv(n, current_size*2)
  settings.global_size = {args.k / 4, num_pages * (args.k / 16) * 4};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1};
  settings.local_size_ref = {4, 4};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"TMMWGSX", "TMMWGSY"}};
  settings.div_global = {{}};

  // Sets the tuning parameters and their possible values
  // TODO: Make these actually tunable, apart from LOCALPAD
  settings.parameters = {
    {"INTERNAL_BLOCK_SIZE", {16}},
    {"LOCALPAD", {0, 1}},
    {"TMMWGSX", {4}},
    {"TMMWGSY", {4}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = 1 * GetBytes(args.precision);
  settings.performance_unit = "N/A";

  return settings;
}

// Tests for valid arguments
template <typename T>
void InvertTestValidArguments(const int, const Arguments<T> &args) {
  if (!(args.k == 16)) {
    throw std::runtime_error("'TripleMatMul16Part1Lower' requires 'k' to be 16");
  }
}
std::vector<Constraint> InvertSetConstraints(const int) { return {}; }
template <typename T>
LocalMemSizeInfo InvertComputeLocalMemSize(const int) {
  return {
    [] (std::vector<size_t> v) -> size_t {
      return GetBytes(PrecisionValue<T>()) * (16 + v[0]) * 16;
    },
    {"LOCALPAD"}
  };
}

// Sets the kernel's arguments
template <typename T>
void InvertSetArguments(const int, Kernel &kernel, const Arguments<T> &args,
                        std::vector<Buffer<T>>& buffers) {
  const auto num_pages = CeilDiv(args.n, args.k * 2); // CeilDiv(n, current_size*2)
  kernel.SetArgument(0, static_cast<int>(args.n)); // n
  kernel.SetArgument(1, buffers[2]()); // 2 == A matrix
  kernel.SetArgument(2, 0); // a_offset
  kernel.SetArgument(3, static_cast<int>(args.n)); // a_ld
  kernel.SetArgument(4, buffers[3]()); // 3 == B matrix
  kernel.SetArgument(5, static_cast<int>(args.k)); // current_size
  kernel.SetArgument(6, static_cast<int>(num_pages)); // num_pages
  kernel.SetArgument(7, static_cast<int>(args.m)); // block_size
}

// =================================================================================================
} // namespace clblast
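// =================================================================================================
// [Editor's illustrative sketch, not part of the CLBlast sources] How the tuner evaluates a
// LocalMemSizeInfo such as InvertComputeLocalMemSize above: the values of the listed parameters
// (here only "LOCALPAD") are gathered into a vector and passed to the lambda. Single precision
// (4 bytes per element) is assumed below for illustration.

#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
  const size_t bytes_per_element = 4;  // assumed: single precision
  // Same formula as the lambda above, with v = {LOCALPAD}:
  std::function<size_t(std::vector<size_t>)> local_mem_size =
      [=](std::vector<size_t> v) { return bytes_per_element * (16 + v[0]) * 16; };
  for (const size_t localpad : {size_t{0}, size_t{1}}) {
    std::printf("LOCALPAD=%zu -> %zu bytes of local memory\n",
                localpad, local_mem_size({localpad}));  // 1024 and 1088 bytes
  }
}
// =================================================================================================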
CLBlast-1.6.3/src/tuning/kernels/transpose_fast.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the transpose OpenCL kernels.
//
// =================================================================================================

#include "tuning/kernels/transpose_fast.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
    switch(clblast::GetPrecision(command_line_args)) {
      case clblast::Precision::kHalf:
        clblast::Tuner<half>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<half>,
                             clblast::TransposeTestValidArguments<half>, clblast::TransposeSetConstraints,
                             clblast::TransposeComputeLocalMemSize<half>, clblast::TransposeSetArguments<half>); break;
      case clblast::Precision::kSingle:
        clblast::Tuner<float>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<float>,
                              clblast::TransposeTestValidArguments<float>, clblast::TransposeSetConstraints,
                              clblast::TransposeComputeLocalMemSize<float>, clblast::TransposeSetArguments<float>); break;
      case clblast::Precision::kDouble:
        clblast::Tuner<double>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<double>,
                               clblast::TransposeTestValidArguments<double>, clblast::TransposeSetConstraints,
                               clblast::TransposeComputeLocalMemSize<double>, clblast::TransposeSetArguments<double>); break;
      case clblast::Precision::kComplexSingle:
        clblast::Tuner<float2>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<float2>,
                               clblast::TransposeTestValidArguments<float2>, clblast::TransposeSetConstraints,
                               clblast::TransposeComputeLocalMemSize<float2>, clblast::TransposeSetArguments<float2>); break;
      case clblast::Precision::kComplexDouble:
        clblast::Tuner<double2>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<double2>,
                                clblast::TransposeTestValidArguments<double2>, clblast::TransposeSetConstraints,
                                clblast::TransposeComputeLocalMemSize<double2>, clblast::TransposeSetArguments<double2>); break;
    }
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/transpose_fast.hpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the transpose OpenCL kernels.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults TransposeGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgM, kArgN, kArgAlpha};
  settings.default_m = 1024;
  settings.default_n = 1024;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings TransposeGetTunerSettings(const int, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "transpose";
  settings.kernel_name = "TransposeMatrixFast";
  settings.sources =
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/transpose_fast.opencl"
  ;

  // Buffer sizes
  settings.size_a = args.m * args.n;
  settings.size_b = args.m * args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {2, 3};
  settings.outputs = {3};

  // Sets the base thread configuration
  settings.global_size = {args.m, args.n};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1};
  settings.local_size_ref = {8, 8};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"TRA_DIM", "TRA_DIM"}};
  settings.div_global = {{"TRA_WPT", "TRA_WPT"}};

  // Sets the tuning parameters and their possible values
  settings.parameters = {
    {"TRA_DIM", {4, 8, 16, 32, 64}},
    {"TRA_WPT", {1, 2, 4, 8, 16}},
    {"TRA_PAD", {0, 1}},
    {"TRA_SHUFFLE", {0, 1}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision);
  settings.performance_unit = "GB/s";

  return settings;
}

// Tests for valid arguments
template <typename T>
void TransposeTestValidArguments(const int, const Arguments<T> &) { }
std::vector<Constraint> TransposeSetConstraints(const int) { return {}; }
template <typename T>
LocalMemSizeInfo TransposeComputeLocalMemSize(const int) {
  return {
    [] (std::vector<size_t> v) -> size_t {
      return GetBytes(PrecisionValue<T>()) * v[1] * (v[1] * v[0]) * (v[0] + v[2]);
    },
    {"TRA_DIM", "TRA_WPT", "TRA_PAD"}
  };
}

// Sets the kernel's arguments
template <typename T>
void TransposeSetArguments(const int, Kernel &kernel, const Arguments<T> &args,
                           std::vector<Buffer<T>>& buffers) {
  kernel.SetArgument(0, static_cast<int>(args.m));
  kernel.SetArgument(1, buffers[2]());  // 2 == A matrix
  kernel.SetArgument(2, buffers[3]());  // 3 == B matrix
  kernel.SetArgument(3, GetRealArg(args.alpha));
}

// =================================================================================================
} // namespace clblast
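// -------------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the CLBlast sources): how the mul_local/div_global transforms in
// transpose_fast.hpp above turn the base thread configuration into a concrete NDRange. The chosen
// TRA_DIM/TRA_WPT values are example assumptions, not tuner output.
// -------------------------------------------------------------------------------------------------
#include <cstdio>

int main() {
  const size_t m = 1024, n = 1024;         // default_m / default_n from the tuner defaults
  const size_t tra_dim = 16, tra_wpt = 4;  // one candidate configuration
  // The local size starts at {1, 1} and is multiplied by {TRA_DIM, TRA_DIM};
  // the global size starts at {m, n} and is divided by {TRA_WPT, TRA_WPT}:
  const size_t local[2] = {1 * tra_dim, 1 * tra_dim};
  const size_t global[2] = {m / tra_wpt, n / tra_wpt};
  std::printf("global = %zux%zu, local = %zux%zu\n", global[0], global[1], local[0], local[1]);
  return 0;
}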
CLBlast-1.6.3/src/tuning/kernels/transpose_pad.cpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the pad-transpose OpenCL kernels.
//
// =================================================================================================

#include "tuning/kernels/transpose_pad.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
    switch(clblast::GetPrecision(command_line_args)) {
      case clblast::Precision::kHalf:
        clblast::Tuner<half>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults,
            clblast::PadtransposeGetTunerSettings<half>, clblast::PadtransposeTestValidArguments<half>,
            clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<half>,
            clblast::PadtransposeSetArguments<half>); break;
      case clblast::Precision::kSingle:
        clblast::Tuner<float>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults,
            clblast::PadtransposeGetTunerSettings<float>, clblast::PadtransposeTestValidArguments<float>,
            clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<float>,
            clblast::PadtransposeSetArguments<float>); break;
      case clblast::Precision::kDouble:
        clblast::Tuner<double>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults,
            clblast::PadtransposeGetTunerSettings<double>, clblast::PadtransposeTestValidArguments<double>,
            clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<double>,
            clblast::PadtransposeSetArguments<double>); break;
      case clblast::Precision::kComplexSingle:
        clblast::Tuner<float2>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults,
            clblast::PadtransposeGetTunerSettings<float2>, clblast::PadtransposeTestValidArguments<float2>,
            clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<float2>,
            clblast::PadtransposeSetArguments<float2>); break;
      case clblast::Precision::kComplexDouble:
        clblast::Tuner<double2>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults,
            clblast::PadtransposeGetTunerSettings<double2>, clblast::PadtransposeTestValidArguments<double2>,
            clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<double2>,
            clblast::PadtransposeSetArguments<double2>); break;
    }
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/transpose_pad.hpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the pad-transpose OpenCL kernels.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults PadtransposeGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgM, kArgN, kArgAlpha};
  settings.default_m = 1024;
  settings.default_n = 1024;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings PadtransposeGetTunerSettings(const int, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "padtranspose";
  settings.kernel_name = "TransposePadMatrix";
  settings.sources =
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/transpose_pad.opencl"
  ;

  // Buffer sizes
  settings.size_a = args.m * args.n;
  settings.size_b = args.m * args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {2, 3};
  settings.outputs = {3};

  // Sets the base thread configuration
  settings.global_size = {args.m, args.n};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1};
  settings.local_size_ref = {8, 8};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"PADTRA_TILE", "PADTRA_TILE"}};
  settings.div_global = {{"PADTRA_WPT", "PADTRA_WPT"}};

  // Sets the tuning parameters and their possible values
  settings.parameters = {
    {"PADTRA_TILE", {8, 16, 32, 64}},
    {"PADTRA_WPT", {1, 2, 4, 8, 16}},
    {"PADTRA_PAD", {0, 1}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision);
  settings.performance_unit = "GB/s";

  return settings;
}

// Tests for valid arguments
template <typename T>
void PadtransposeTestValidArguments(const int, const Arguments<T> &) { }
std::vector<Constraint> PadtransposeSetConstraints(const int) { return {}; }
template <typename T>
LocalMemSizeInfo PadtransposeComputeLocalMemSize(const int) {
  return {
    [] (std::vector<size_t> v) -> size_t {
      return GetBytes(PrecisionValue<T>()) * (v[1] * v[0]) * (v[1] * v[0] + v[2]);
    },
    {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}
  };
}

// Sets the kernel's arguments
template <typename T>
void PadtransposeSetArguments(const int, Kernel &kernel, const Arguments<T> &args,
                              std::vector<Buffer<T>>& buffers) {
  kernel.SetArgument(0, static_cast<int>(args.m));
  kernel.SetArgument(1, static_cast<int>(args.n));
  kernel.SetArgument(2, static_cast<int>(args.m));
  kernel.SetArgument(3, 0);
  kernel.SetArgument(4, buffers[2]());  // 2 == A matrix
  kernel.SetArgument(5, static_cast<int>(args.n));
  kernel.SetArgument(6, static_cast<int>(args.m));
  kernel.SetArgument(7, static_cast<int>(args.n));
  kernel.SetArgument(8, 0);
  kernel.SetArgument(9, buffers[3]());  // 3 == B matrix
  kernel.SetArgument(10, GetRealArg(args.alpha));
  kernel.SetArgument(11, 0);
}

// =================================================================================================
} // namespace clblast
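// -------------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the CLBlast sources): the "GB/s" figure this tuner reports
// follows from the metric_amount above (2*m*n elements, one read plus one write) divided by the
// measured kernel time. The runtime below is a made-up value purely for the arithmetic.
// -------------------------------------------------------------------------------------------------
#include <cstdio>

int main() {
  const double m = 1024.0, n = 1024.0, bytes = 4.0;  // assuming single precision
  const double metric_amount = 2.0 * m * n * bytes;  // as in PadtransposeGetTunerSettings
  const double seconds = 0.25e-3;                    // hypothetical kernel runtime
  std::printf("%.2f GB/s\n", metric_amount / seconds * 1e-9);
  return 0;
}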
CLBlast-1.6.3/src/tuning/kernels/xaxpy.cpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xaxpy OpenCL kernels.
//
// =================================================================================================

#include "tuning/kernels/xaxpy.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
    switch(clblast::GetPrecision(command_line_args)) {
      case clblast::Precision::kHalf:
        clblast::Tuner<half>(argc, argv, 0, clblast::XaxpyGetTunerDefaults,
            clblast::XaxpyGetTunerSettings<half>, clblast::XaxpyTestValidArguments<half>,
            clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<half>,
            clblast::XaxpySetArguments<half>); break;
      case clblast::Precision::kSingle:
        clblast::Tuner<float>(argc, argv, 0, clblast::XaxpyGetTunerDefaults,
            clblast::XaxpyGetTunerSettings<float>, clblast::XaxpyTestValidArguments<float>,
            clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<float>,
            clblast::XaxpySetArguments<float>); break;
      case clblast::Precision::kDouble:
        clblast::Tuner<double>(argc, argv, 0, clblast::XaxpyGetTunerDefaults,
            clblast::XaxpyGetTunerSettings<double>, clblast::XaxpyTestValidArguments<double>,
            clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<double>,
            clblast::XaxpySetArguments<double>); break;
      case clblast::Precision::kComplexSingle:
        clblast::Tuner<float2>(argc, argv, 0, clblast::XaxpyGetTunerDefaults,
            clblast::XaxpyGetTunerSettings<float2>, clblast::XaxpyTestValidArguments<float2>,
            clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<float2>,
            clblast::XaxpySetArguments<float2>); break;
      case clblast::Precision::kComplexDouble:
        clblast::Tuner<double2>(argc, argv, 0, clblast::XaxpyGetTunerDefaults,
            clblast::XaxpyGetTunerSettings<double2>, clblast::XaxpyTestValidArguments<double2>,
            clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<double2>,
            clblast::XaxpySetArguments<double2>); break;
    }
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/xaxpy.hpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xaxpy OpenCL kernels.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults XaxpyGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgN, kArgAlpha};
  settings.default_n = 4096*1024;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings XaxpyGetTunerSettings(const int, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "xaxpy";
  settings.kernel_name = "XaxpyFastest";
  settings.sources =
#include "../src/kernels/level1/level1.opencl"
#include "../src/kernels/level1/xaxpy.opencl"
  ;

  // Buffer sizes
  settings.size_x = args.n;
  settings.size_y = args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {0, 1};
  settings.outputs = {1};

  // Sets the base thread configuration
  settings.global_size = {args.n};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1};
  settings.local_size_ref = {64};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"WGS"}};
  settings.div_global = {{"WPT"},{"VW"}};

  // Sets the tuning parameters and their possible values
  settings.parameters = {
    {"WGS", {64, 128, 256, 512, 1024, 2048}},
    {"WPT", {1, 2, 4, 8}},
    {"VW", {1, 2, 4, 8}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = 3 * args.n * GetBytes(args.precision);
  settings.performance_unit = "GB/s";

  return settings;
}

// Tests for valid arguments
template <typename T>
void XaxpyTestValidArguments(const int, const Arguments<T> &args) {
  if (!IsMultiple(args.n, 64)) {
    throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW");
  }
}
std::vector<Constraint> XaxpySetConstraints(const int) { return {}; }
template <typename T>
LocalMemSizeInfo XaxpyComputeLocalMemSize(const int) {
  return {
    [] (std::vector<size_t>) -> size_t { return 0; },
    {}
  };
}

// Sets the kernel's arguments
template <typename T>
void XaxpySetArguments(const int, Kernel &kernel, const Arguments<T> &args,
                       std::vector<Buffer<T>>& buffers) {
  kernel.SetArgument(0, static_cast<int>(args.n));
  kernel.SetArgument(1, GetRealArg(args.alpha));
  kernel.SetArgument(2, buffers[0]());  // 0 == X vector
  kernel.SetArgument(3, buffers[1]());  // 1 == Y vector
}

// =================================================================================================
} // namespace clblast
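// -------------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the CLBlast sources): the argument test in xaxpy.hpp above
// rejects vector sizes the fully-vectorized 'XaxpyFastest' kernel cannot handle. A minimal
// stand-in for the IsMultiple helper, with an assumed candidate configuration:
// -------------------------------------------------------------------------------------------------
#include <cstdio>

static bool IsMultipleOf(const size_t x, const size_t y) { return (x % y) == 0; }

int main() {
  const size_t wgs = 128, wpt = 2, vw = 4;  // one candidate configuration
  for (size_t n : {4096u * 1024u, 1000000u}) {
    const bool ok = IsMultipleOf(n, wgs * wpt * vw);
    std::printf("n=%zu -> %s\n", n, ok ? "accepted" : "rejected");
  }
  return 0;
}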
CLBlast-1.6.3/src/tuning/kernels/xconvgemm.cpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the convgemm kernels.
//
// =================================================================================================

#include "tuning/kernels/xconvgemm.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Function to tune a specific variation V (not within the clblast namespace)
template <int V>
void StartVariation(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args)) {
    case clblast::Precision::kHalf:
      clblast::Tuner<half>(argc, argv, V, clblast::XConvGemmGetTunerDefaults,
          clblast::XConvGemmGetTunerSettings<half>, clblast::XConvGemmTestValidArguments<half>,
          clblast::XConvGemmSetConstraints, clblast::XConvGemmComputeLocalMemSize<half>,
          clblast::XConvGemmSetArguments<half>); break;
    case clblast::Precision::kSingle:
      clblast::Tuner<float>(argc, argv, V, clblast::XConvGemmGetTunerDefaults,
          clblast::XConvGemmGetTunerSettings<float>, clblast::XConvGemmTestValidArguments<float>,
          clblast::XConvGemmSetConstraints, clblast::XConvGemmComputeLocalMemSize<float>,
          clblast::XConvGemmSetArguments<float>); break;
    case clblast::Precision::kDouble:
      clblast::Tuner<double>(argc, argv, V, clblast::XConvGemmGetTunerDefaults,
          clblast::XConvGemmGetTunerSettings<double>, clblast::XConvGemmTestValidArguments<double>,
          clblast::XConvGemmSetConstraints, clblast::XConvGemmComputeLocalMemSize<double>,
          clblast::XConvGemmSetArguments<double>); break;
  }
}

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    StartVariation<1>(argc, argv);
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/xconvgemm.hpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the ConvGemm kernels. These kernels are based on the GEMM
// direct kernel and will use those parameters, this tuner is just optional to use for advanced
// users.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Helper functions
template <typename T>
size_t OutputHeight(const Arguments<T> &args) {
  const auto size = args.height + 2 * args.pad_h;
  const auto padding = args.dilation_h * (args.kernel_h - 1) + 1;
  if (size >= padding) { return (size - padding) / args.stride_h + 1; }
  return 1;
}
template <typename T>
size_t OutputWidth(const Arguments<T> &args) {
  const auto size = args.width + 2 * args.pad_w;
  const auto padding = args.dilation_w * (args.kernel_w - 1) + 1;
  if (size >= padding) { return (size - padding) / args.stride_w + 1; }
  return 1;
}

// Settings for this kernel (default command-line arguments)
TunerDefaults XConvGemmGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgChannels, kArgHeight, kArgWidth, kArgKernelH, kArgKernelW,
                      kArgNumKernels, kArgBatchCount, kArgFraction};
  settings.channels = 32;
  settings.height = 66;
  settings.width = 66;  // num_patches = 64x64 = 4096
  settings.kernel_h = 3;
  settings.kernel_w = 3;
  settings.num_kernels = 32;
  settings.default_batch_count = 16;
  settings.default_fraction = 1.0;
  settings.default_num_runs = 2;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings XConvGemmGetTunerSettings(const int, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "xconvgemm";
  settings.kernel_name = "XconvgemmNormal";
  settings.sources =
"#define ROUTINE_CONVGEMM"
#include "../src/kernels/level3/xgemm_direct_part1.opencl"
#include "../src/kernels/level3/xgemm_direct_part2.opencl"
#include "../src/kernels/level3/xgemm_direct_part3.opencl"
#include "../src/kernels/levelx/xconvgemm_part1.opencl"
#include "../src/kernels/levelx/xconvgemm_part2.opencl"
  ;

  // Helper variables
  const auto patch_size = args.kernel_h * args.kernel_w * args.channels;
  const auto num_patches = OutputHeight(args) * OutputWidth(args);

  // Buffer sizes
  settings.size_a = args.batch_count * args.channels * args.height * args.width;
  settings.size_b = args.num_kernels * args.channels * args.kernel_h * args.kernel_w;
  settings.size_c = args.batch_count * args.num_kernels * OutputHeight(args) * OutputWidth(args);

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {2, 3, 4};
  settings.outputs = {4};

  // Sets the base thread configuration
  settings.global_size = {num_patches, args.num_kernels, args.batch_count};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1, 1};
  settings.local_size_ref = {8, 8, 1};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"MDIMCD", "NDIMCD"}};
  settings.mul_global = {{"MDIMCD", "NDIMCD"}};
  settings.div_global = {{"WGD", "WGD"}};

  // Sets the tuning parameters and their possible values
  settings.parameters = {
    {"WGD", {8, 16, 32}},
    {"MDIMCD", {8, 16, 32}},
    {"NDIMCD", {8, 16, 32}},
    {"MDIMAD", {8, 16, 32}},
    {"NDIMBD", {8, 16, 32}},
    {"KWID", {1}},
    {"VWMD", {1, 2, 4, 8}},
    {"VWND", {1, 2, 4, 8}},
    {"PADA", {0}},
    {"PADB", {0}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = args.batch_count * 2 * num_patches * args.num_kernels * patch_size;
  settings.performance_unit = "GFLOPS";

  return settings;
}

// Tests for valid arguments
template <typename T>
void XConvGemmTestValidArguments(const int, const Arguments<T> &) { }
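// Worked example (illustrative, not from the sources): with the defaults above (height = width =
// 66, kernel_h = kernel_w = 3) and the pad = 0, stride = 1, dilation = 1 values hard-coded in
// XConvGemmSetArguments below, OutputHeight computes (66 + 2*0 - (1*(3 - 1) + 1)) / 1 + 1 = 64,
// so num_patches = 64 * 64 = 4096, matching the comment in XConvGemmGetTunerDefaults.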
std::vector<Constraint> XConvGemmSetConstraints(const int) {
  auto constraints = std::vector<Constraint>();
  auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
  auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
  auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
  // Requirement for unrolling the WGD loop
  constraints.push_back({MultipleOfX, {"WGD", "KWID"}});
  // Required for integer MWID and NWID
  constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}});
  constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}});
  // Required for integer MWIAD and NWIBD
  constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}});
  constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}});
  // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
  constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}});
  constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}});
  return constraints;
}
template <typename T>
LocalMemSizeInfo XConvGemmComputeLocalMemSize(const int) {
  return {
    [] (std::vector<size_t> v) -> size_t {
      return GetBytes(PrecisionValue<T>()) * ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2])));
    },
    {"WGD", "PADA", "PADB"}
  };
}

// Sets the kernel's arguments
template <typename T>
void XConvGemmSetArguments(const int, Kernel &kernel, const Arguments<T> &args,
                           std::vector<Buffer<T>>& buffers) {
  const auto output_h = OutputHeight(args);
  const auto output_w = OutputWidth(args);
  const auto patch_size = args.kernel_h * args.kernel_w * args.channels;
  const auto num_patches = output_h * output_w;
  const auto result_stride = args.num_kernels * output_h * output_w;
  kernel.SetArgument(0, static_cast<int>(num_patches));
  kernel.SetArgument(1, static_cast<int>(args.num_kernels));
  kernel.SetArgument(2, static_cast<int>(patch_size));
  kernel.SetArgument(3, buffers[3]());  // 3 == B matrix ==> kernel buffer
  kernel.SetArgument(4, 0);             // kernel offset
  kernel.SetArgument(5, buffers[4]());  // 4 == C matrix ==> result buffer
  kernel.SetArgument(6, 0);             // result offset
  kernel.SetArgument(7, static_cast<int>(result_stride));
  kernel.SetArgument(8, buffers[2]());  // 2 == A matrix ==> image buffer
  kernel.SetArgument(9, 0);             // image offset
  kernel.SetArgument(10, static_cast<int>(args.height));
  kernel.SetArgument(11, static_cast<int>(args.width));
  kernel.SetArgument(12, static_cast<int>(args.channels));
  kernel.SetArgument(13, static_cast<int>(args.kernel_h));
  kernel.SetArgument(14, static_cast<int>(args.kernel_w));
  kernel.SetArgument(15, 0);  // pad_h
  kernel.SetArgument(16, 0);  // pad_w
  kernel.SetArgument(17, 1);  // stride_h
  kernel.SetArgument(18, 1);  // stride_w
  kernel.SetArgument(19, 1);  // dilation_h
  kernel.SetArgument(20, 1);  // dilation_w
  kernel.SetArgument(21, static_cast<int>(output_h));
  kernel.SetArgument(22, static_cast<int>(output_w));
}

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/tuning/kernels/xdot.cpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xdot OpenCL kernels. Note that the results are
// not verified, since the result is not final and depends on the WGS2 parameter.
//
// =================================================================================================

#include "tuning/kernels/xdot.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Function to tune a specific variation V (not within the clblast namespace)
template <int V>
void StartVariation(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args)) {
    case clblast::Precision::kHalf:
      clblast::Tuner<half>(argc, argv, V, clblast::XdotGetTunerDefaults,
          clblast::XdotGetTunerSettings<half>, clblast::XdotTestValidArguments<half>,
          clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<half>,
          clblast::XdotSetArguments<half>); break;
    case clblast::Precision::kSingle:
      clblast::Tuner<float>(argc, argv, V, clblast::XdotGetTunerDefaults,
          clblast::XdotGetTunerSettings<float>, clblast::XdotTestValidArguments<float>,
          clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<float>,
          clblast::XdotSetArguments<float>); break;
    case clblast::Precision::kDouble:
      clblast::Tuner<double>(argc, argv, V, clblast::XdotGetTunerDefaults,
          clblast::XdotGetTunerSettings<double>, clblast::XdotTestValidArguments<double>,
          clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<double>,
          clblast::XdotSetArguments<double>); break;
    case clblast::Precision::kComplexSingle:
      clblast::Tuner<float2>(argc, argv, V, clblast::XdotGetTunerDefaults,
          clblast::XdotGetTunerSettings<float2>, clblast::XdotTestValidArguments<float2>,
          clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<float2>,
          clblast::XdotSetArguments<float2>); break;
    case clblast::Precision::kComplexDouble:
      clblast::Tuner<double2>(argc, argv, V, clblast::XdotGetTunerDefaults,
          clblast::XdotGetTunerSettings<double2>, clblast::XdotTestValidArguments<double2>,
          clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<double2>,
          clblast::XdotSetArguments<double2>); break;
  }
}

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    StartVariation<1>(argc, argv);
    StartVariation<2>(argc, argv);
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/xdot.hpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xdot OpenCL kernels. Note that the results are
// not verified, since the result is not final and depends on the WGS2 parameter.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults XdotGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgN};
  settings.default_n = 2*1024*1024;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings XdotGetTunerSettings(const int V, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "xdot_"+std::to_string(V);
  settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue";
  settings.sources =
#include "../src/kernels/level1/xdot.opencl"
  ;

  // Buffer sizes
  settings.size_x = args.n;
  settings.size_y = args.n;
  settings.size_temp = args.n;  // Worst case

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {0, 1, 5};
  settings.outputs = {};  // no output checking

  // Sets the base thread configuration
  settings.global_size = (V==1) ? std::vector<size_t>{2*64} : std::vector<size_t>{1};
  settings.global_size_ref = (V==1) ? std::vector<size_t>{2*64*64} : std::vector<size_t>{64};
  settings.local_size = {1};
  settings.local_size_ref = {64};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}};
  settings.mul_global = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}};

  // Sets the tuning parameters and their possible values
  settings.parameters = {
    {"WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = (V==1) ? (2*args.n + 1) * GetBytes(args.precision)
                                  : 1 * GetBytes(args.precision);
  settings.performance_unit = (V==1) ? "GB/s" : "N/A";

  return settings;
}

// Tests for valid arguments
template <typename T>
void XdotTestValidArguments(const int, const Arguments<T> &) { }
std::vector<Constraint> XdotSetConstraints(const int) { return {}; }
template <typename T>
LocalMemSizeInfo XdotComputeLocalMemSize(const int V) {
  return {
    [] (std::vector<size_t> v) -> size_t {
      return GetBytes(PrecisionValue<T>()) * v[0];
    },
    {"WGS"+std::to_string(V)}
  };
}

// Sets the kernel's arguments
template <typename T>
void XdotSetArguments(const int V, Kernel &kernel, const Arguments<T> &args,
                      std::vector<Buffer<T>>& buffers) {
  if (V == 1) {
    kernel.SetArgument(0, static_cast<int>(args.n));
    kernel.SetArgument(1, buffers[0]());  // 0 == X vector
    kernel.SetArgument(2, 0);
    kernel.SetArgument(3, 1);
    kernel.SetArgument(4, buffers[1]());  // 1 == Y vector
    kernel.SetArgument(5, 0);
    kernel.SetArgument(6, 1);
    kernel.SetArgument(7, buffers[5]());  // 5 == temp; no output checking - size varies
    kernel.SetArgument(8, static_cast<int>(false));
  }
  else {
    kernel.SetArgument(0, buffers[5]());  // 5 == temp
    kernel.SetArgument(1, buffers[0]());  // 0 == X vector; no output checking - size varies
    kernel.SetArgument(2, 0);
  }
}

// =================================================================================================
} // namespace clblast
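// -------------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the CLBlast sources): the two xdot kernels above form a classic
// two-stage reduction. Stage 1 ('Xdot', V==1) launches 2*64 work-groups of WGS1 threads, each
// writing one partial sum into the temp buffer; stage 2 ('XdotEpilogue', V==2) launches a single
// work-group of WGS2 threads to combine those partials. Host-side arithmetic, with assumed values:
// -------------------------------------------------------------------------------------------------
#include <cstdio>

int main() {
  const size_t wgs1 = 256, wgs2 = 64;   // candidate values from the WGS1/WGS2 lists above
  const size_t groups_stage1 = 2 * 64;  // from settings.global_size = {2*64}, scaled by WGS1
  std::printf("stage 1: %zu threads (%zu groups), %zu partial sums\n",
              groups_stage1 * wgs1, groups_stage1, groups_stage1);
  std::printf("stage 2: %zu threads (1 group)\n", wgs2);
  return 0;
}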
CLBlast-1.6.3/src/tuning/kernels/xgemm.cpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xgemm OpenCL kernels.
//
// =================================================================================================

#include "tuning/kernels/xgemm.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Function to tune a specific variation V (not within the clblast namespace)
template <int V>
void StartVariation(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args)) {
    case clblast::Precision::kHalf:
      clblast::Tuner<half>(argc, argv, V, clblast::XgemmGetTunerDefaults,
          clblast::XgemmGetTunerSettings<half>, clblast::XgemmTestValidArguments<half>,
          clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<half>,
          clblast::XgemmSetArguments<half>); break;
    case clblast::Precision::kSingle:
      clblast::Tuner<float>(argc, argv, V, clblast::XgemmGetTunerDefaults,
          clblast::XgemmGetTunerSettings<float>, clblast::XgemmTestValidArguments<float>,
          clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<float>,
          clblast::XgemmSetArguments<float>); break;
    case clblast::Precision::kDouble:
      clblast::Tuner<double>(argc, argv, V, clblast::XgemmGetTunerDefaults,
          clblast::XgemmGetTunerSettings<double>, clblast::XgemmTestValidArguments<double>,
          clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<double>,
          clblast::XgemmSetArguments<double>); break;
    case clblast::Precision::kComplexSingle:
      clblast::Tuner<float2>(argc, argv, V, clblast::XgemmGetTunerDefaults,
          clblast::XgemmGetTunerSettings<float2>, clblast::XgemmTestValidArguments<float2>,
          clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<float2>,
          clblast::XgemmSetArguments<float2>); break;
    case clblast::Precision::kComplexDouble:
      clblast::Tuner<double2>(argc, argv, V, clblast::XgemmGetTunerDefaults,
          clblast::XgemmGetTunerSettings<double2>, clblast::XgemmTestValidArguments<double2>,
          clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<double2>,
          clblast::XgemmSetArguments<double2>); break;
  }
}

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    printf("* (1/4) Tuning main GEMM kernel (GEMMK == 0) for fixed set of parameters\n\n");
    StartVariation<1>(argc, argv);
    printf("* (2/4) Tuning main GEMM kernel (GEMMK == 0) for random parameters out of larger set\n\n");
    StartVariation<2>(argc, argv);
    printf("* (3/4) Tuning secondary GEMM kernel (GEMMK == 1) for fixed set of parameters\n\n");
    StartVariation<11>(argc, argv);
    printf("* (4/4) Tuning secondary GEMM kernel (GEMMK == 1) for random parameters out of larger set\n\n");
    StartVariation<12>(argc, argv);
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/xgemm.hpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xgemm OpenCL kernels. There are multiple variations:
// - V==1:  Kernel 0: This tests some limited set of tuning parameters exhaustively.
// - V==2:  Kernel 0: This tests a much larger set of parameters by randomly sampling a subset.
// - V==11: Kernel 1: This tests some limited set of tuning parameters exhaustively.
// - V==12: Kernel 1: This tests a much larger set of parameters by randomly sampling a subset.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults XgemmGetTunerDefaults(const int V) {
  auto settings = TunerDefaults();
  settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction,
                      kArgHeuristicSelection, kArgPsoSwarmSize,
                      kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom};
  settings.default_m = 1024;
  settings.default_n = 1024;
  settings.default_k = 1024;
  settings.default_fraction = (V == 1 || V == 11) ? 1.0 : (V == 2) ? 512.0 : 128.0;  // test all or sample randomly
  settings.default_num_runs = 2;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "xgemm_" + ToString(V);
  settings.kernel_name = "Xgemm";
  settings.sources = (V == 11 || V == 12) ? "#define GEMMK 1" : "#define GEMMK 0";
  settings.sources +=
#include "../src/kernels/level3/xgemm_part1.opencl"
#include "../src/kernels/level3/xgemm_part2.opencl"
  ;
  settings.sources +=
#include "../src/kernels/level3/xgemm_part3.opencl"
#include "../src/kernels/level3/xgemm_part4.opencl"
  ;

  // Buffer sizes
  settings.size_a = args.m * args.k;
  settings.size_b = args.n * args.k;
  settings.size_c = args.m * args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {2, 3, 4};
  settings.outputs = {4};

  // Sets the base thread configuration
  settings.global_size = {args.m, args.n};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1};
  settings.local_size_ref = {8, 8};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"MDIMC", "NDIMC"}};
  settings.mul_global = {{"MDIMC", "NDIMC"}};
  settings.div_global = {{"MWG", "NWG"}};

  // Sets the tuning parameters and their possible values
  if (V == 1) {  // Kernel 0: limited subset of tuning parameters - but explorable exhaustively
    settings.parameters = {
      {"GEMMK", {0}}, {"MWG", {16, 32, 64}}, {"NWG", {16, 32, 64}}, {"KWG", {32}},
      {"MDIMC", {8, 16, 32}}, {"NDIMC", {8, 16, 32}}, {"MDIMA", {8, 16, 32}}, {"NDIMB", {8, 16, 32}},
      {"KWI", {2}}, {"VWM", {1, 2, 4}}, {"VWN", {1, 2, 4}},
      {"STRM", {0}}, {"STRN", {0}}, {"SA", {0, 1}}, {"SB", {0, 1}}, {"KREG", {1}}
    };
  }
  else if (V == 2) {  // Kernel 0: a lot more tuning parameters - has to be sampled randomly, too much to test all
    settings.parameters = {
      {"GEMMK", {0}}, {"MWG", {16, 32, 64, 128}}, {"NWG", {16, 32, 64, 128}}, {"KWG", {16, 32}},
      {"MDIMC", {8, 16, 32}}, {"NDIMC", {8, 16, 32}}, {"MDIMA", {8, 16, 32}}, {"NDIMB", {8, 16, 32}},
      {"KWI", {2}}, {"VWM", {1, 2, 4, 8}}, {"VWN", {1, 2, 4, 8}},
      {"STRM", {0, 1}}, {"STRN", {0, 1}}, {"SA", {0, 1}}, {"SB", {0, 1}}, {"KREG", {1}}
    };
  }
  else if (V == 11) {  // Kernel 1: limited subset of tuning parameters - but explorable exhaustively
    settings.parameters = {
      {"GEMMK", {1}}, {"MWG", {16, 32, 64}}, {"NWG", {16, 32, 64}}, {"KWG", {1}},
      {"MDIMC", {4, 8, 16}}, {"NDIMC", {4, 8, 16}}, {"MDIMA", {4, 8, 16}}, {"NDIMB", {4, 8, 16}},
      {"KWI", {1}}, {"VWM", {1, 2, 4}}, {"VWN", {1, 2, 4}},
      {"STRM", {0}}, {"STRN", {0}}, {"SA", {0}}, {"SB", {0}}, {"KREG", {1, 2, 4}}
    };
  }
  else if (V == 12) {  // Kernel 1: a lot more tuning parameters - has to be sampled randomly, too much to test all
    settings.parameters = {
      {"GEMMK", {1}}, {"MWG", {8, 16, 32, 64, 128}}, {"NWG", {8, 16, 32, 64, 128}}, {"KWG", {1}},
      {"MDIMC", {2, 4, 8, 16, 32}}, {"NDIMC", {2, 4, 8, 16, 32}}, {"MDIMA", {2, 4, 8, 16, 32}},
      {"NDIMB", {2, 4, 8, 16, 32}}, {"KWI", {1}}, {"VWM", {1, 2, 4, 8}}, {"VWN", {1, 2, 4, 8}},
      {"STRM", {0}}, {"STRN", {0}}, {"SA", {0}}, {"SB", {0}}, {"KREG", {1, 2, 4, 8, 16}}
    };
  }

  // Describes how to compute the performance metrics
  if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) {
    // complex flops
    settings.metric_amount = args.m * args.n * (8 * args.k - 2);
  } else {
    // scalar flops
    settings.metric_amount = args.m * args.n * (2 * args.k - 1);
  }
  settings.performance_unit = "GFLOPS";

  return settings;
}
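// Worked example (illustrative, not from the sources): with the defaults m = n = k = 1024, the
// real-valued metric above is 1024 * 1024 * (2*1024 - 1), roughly 2.1 GFLOP per kernel launch, so
// a measured runtime of 10 ms would be reported as roughly 215 GFLOPS. The 8*k - 2 term for
// complex types counts 4 multiplies and 4 additions per complex multiply-accumulate.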
{"GEMMK", {1}}, {"MWG", {16, 32, 64}}, {"NWG", {16, 32, 64}}, {"KWG", {1}}, {"MDIMC", {4, 8, 16}}, {"NDIMC", {4, 8, 16}}, {"MDIMA", {4, 8, 16}}, {"NDIMB", {4, 8, 16}}, {"KWI", {1}}, {"VWM", {1, 2, 4}}, {"VWN", {1, 2, 4}}, {"STRM", {0}}, {"STRN", {0}}, {"SA", {0}}, {"SB", {0}}, {"KREG", {1, 2, 4}} }; } else if (V == 12) { // Kernel 1: a lot more tuning parameters - has to be sampled randomly, too much to test all settings.parameters = { {"GEMMK", {1}}, {"MWG", {8, 16, 32, 64, 128}}, {"NWG", {8, 16, 32, 64, 128}}, {"KWG", {1}}, {"MDIMC", {2, 4, 8, 16, 32}}, {"NDIMC", {2, 4, 8, 16, 32}}, {"MDIMA", {2, 4, 8, 16, 32}}, {"NDIMB", {2, 4, 8, 16, 32}}, {"KWI", {1}}, {"VWM", {1, 2, 4, 8}}, {"VWN", {1, 2, 4, 8}}, {"STRM", {0}}, {"STRN", {0}}, {"SA", {0}}, {"SB", {0}}, {"KREG", {1, 2, 4, 8, 16}} }; } // Describes how to compute the performance metrics if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) { // complex flops settings.metric_amount = args.m * args.n * (8 * args.k - 2); } else { // scalar flops settings.metric_amount = args.m * args.n * (2 * args.k - 1); } settings.performance_unit = "GFLOPS"; return settings; } // Tests for valid arguments template void XgemmTestValidArguments(const int V, const Arguments &args) { const auto mwg_max = (V == 1 || V == 11) ? 64 : 128; const auto nwg_max = (V == 1 || V == 11) ? 64 : 128; if (!IsMultiple(args.m, mwg_max)) { throw std::runtime_error("'Xgemm' kernel requires 'm' to be a multiple of MWG (max " + ToString(mwg_max) + ")"); } if (!IsMultiple(args.n, nwg_max)) { throw std::runtime_error("'Xgemm' kernel requires 'n' to be a multiple of NWG (max " + ToString(nwg_max) + ")"); } } std::vector XgemmSetConstraints(const int V) { auto constraints = std::vector(); auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; // Requirement for unrolling the KWG loop constraints.push_back({MultipleOfX, {"KWG", "KWI"}}); // Required for integer MWI and NWI constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}}); constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}}); // Required for integer MWIA and NWIB constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}}); constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}}); if (V == 1 || V == 2) { // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) 
template <typename T>
LocalMemSizeInfo XgemmComputeLocalMemSize(const int) {
  return {
    [] (std::vector<size_t> v) -> size_t {
      return GetBytes(PrecisionValue<T>()) * ((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]));
    },
    {"SA", "KWG", "MWG", "SB", "KWG", "NWG"}
  };
}

// Sets the kernel's arguments
template <typename T>
void XgemmSetArguments(const int, Kernel &kernel, const Arguments<T> &args,
                       std::vector<Buffer<T>>& buffers) {
  kernel.SetArgument(0, static_cast<int>(args.m));
  kernel.SetArgument(1, static_cast<int>(args.n));
  kernel.SetArgument(2, static_cast<int>(args.k));
  kernel.SetArgument(3, GetRealArg(args.alpha));
  kernel.SetArgument(4, GetRealArg(args.beta));
  kernel.SetArgument(5, buffers[2]());  // 2 == A matrix
  kernel.SetArgument(6, buffers[3]());  // 3 == B matrix
  kernel.SetArgument(7, buffers[4]());  // 4 == C matrix
  kernel.SetArgument(8, 0);
  kernel.SetArgument(9, 0);
}

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/tuning/kernels/xgemm_direct.cpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the direct xgemm kernels.
//
// =================================================================================================

#include "tuning/kernels/xgemm_direct.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Function to tune a specific variation V (not within the clblast namespace)
template <int V>
void StartVariation(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args)) {
    case clblast::Precision::kHalf:
      clblast::Tuner<half>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults,
          clblast::XgemmDirectGetTunerSettings<half>, clblast::XgemmDirectTestValidArguments<half>,
          clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<half>,
          clblast::XgemmDirectSetArguments<half>); break;
    case clblast::Precision::kSingle:
      clblast::Tuner<float>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults,
          clblast::XgemmDirectGetTunerSettings<float>, clblast::XgemmDirectTestValidArguments<float>,
          clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<float>,
          clblast::XgemmDirectSetArguments<float>); break;
    case clblast::Precision::kDouble:
      clblast::Tuner<double>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults,
          clblast::XgemmDirectGetTunerSettings<double>, clblast::XgemmDirectTestValidArguments<double>,
          clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<double>,
          clblast::XgemmDirectSetArguments<double>); break;
    case clblast::Precision::kComplexSingle:
      clblast::Tuner<float2>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults,
          clblast::XgemmDirectGetTunerSettings<float2>, clblast::XgemmDirectTestValidArguments<float2>,
          clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<float2>,
          clblast::XgemmDirectSetArguments<float2>); break;
    case clblast::Precision::kComplexDouble:
      clblast::Tuner<double2>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults,
          clblast::XgemmDirectGetTunerSettings<double2>, clblast::XgemmDirectTestValidArguments<double2>,
          clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<double2>,
          clblast::XgemmDirectSetArguments<double2>); break;
  }
}

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    StartVariation<1>(argc, argv);
    StartVariation<2>(argc, argv);
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/xgemm_direct.hpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the direct xgemm kernels. There are two variations:
// - V==1: This tests some limited set of tuning parameters exhaustively.
// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults XgemmDirectGetTunerDefaults(const int V) {
  auto settings = TunerDefaults();
  settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction,
                      kArgHeuristicSelection, kArgPsoSwarmSize,
                      kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom};
  settings.default_m = 256;
  settings.default_n = 256;
  settings.default_k = 256;
  settings.default_fraction = (V==1) ? 1.0 : 64.0;  // test all or sample randomly
  settings.default_num_runs = 4;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings XgemmDirectGetTunerSettings(const int V, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = (V==1) ? "xgemm_direct_1" : "xgemm_direct_2";
  settings.kernel_name = "XgemmDirectTN";
  settings.sources =
#include "../src/kernels/level3/xgemm_direct_part1.opencl"
#include "../src/kernels/level3/xgemm_direct_part2.opencl"
#include "../src/kernels/level3/xgemm_direct_part3.opencl"
  ;

  // Buffer sizes
  settings.size_a = args.m * args.k;
  settings.size_b = args.n * args.k;
  settings.size_c = args.m * args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {2, 3, 4};
  settings.outputs = {4};

  // Sets the base thread configuration
  settings.global_size = {args.m, args.n};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1};
  settings.local_size_ref = {8, 8};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"MDIMCD", "NDIMCD"}};
  settings.mul_global = {{"MDIMCD", "NDIMCD"}};
  settings.div_global = {{"WGD", "WGD"}};

  // Sets the tuning parameters and their possible values
  if (V==1) {  // limited subset of tuning parameters - but explorable exhaustively
    settings.parameters = {
      {"WGD", {8, 16, 32}},
      {"MDIMCD", {8, 16, 32}},
      {"NDIMCD", {8, 16, 32}},
      {"MDIMAD", {8, 16, 32}},
      {"NDIMBD", {8, 16, 32}},
      {"KWID", {2}},
      {"VWMD", {1, 2, 4, 8}},
      {"VWND", {1, 2, 4, 8}},
      {"PADA", {1}},
      {"PADB", {1}},
    };
  }
  else {  // a lot more tuning parameters - has to be sampled randomly, too much to test all
    settings.parameters = {
      {"WGD", {8, 16, 32, 64}},
      {"MDIMCD", {8, 16, 32}},
      {"NDIMCD", {8, 16, 32}},
      {"MDIMAD", {8, 16, 32}},
      {"NDIMBD", {8, 16, 32}},
      {"KWID", {2, 8, 16}},
      {"VWMD", {1, 2, 4, 8}},
      {"VWND", {1, 2, 4, 8}},
      {"PADA", {0, 1}},
      {"PADB", {0, 1}},
    };
  }

  // Describes how to compute the performance metrics
  settings.metric_amount = 2 * args.m * args.n * args.k;
  settings.performance_unit = "GFLOPS";

  return settings;
}

// Tests for valid arguments
template <typename T>
void XgemmDirectTestValidArguments(const int, const Arguments<T> &) { }
"NDIMCD", "VWND"}}); // Required for integer MWIAD and NWIBD constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}}); constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}}); // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}}); constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}}); // Extra constraints for variation 1 to limit the set of options significantly if (V==1) { auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; constraints.push_back({IsEqual, {"MDIMCD", "MDIMAD"}}); constraints.push_back({IsEqual, {"NDIMCD", "NDIMBD"}}); } return constraints; } template LocalMemSizeInfo XgemmDirectComputeLocalMemSize(const int) { return { [] (std::vector v) -> size_t { return GetBytes(PrecisionValue()) * ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))); }, {"WGD", "PADA", "PADB"} }; } // Sets the kernel's arguments template void XgemmDirectSetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { kernel.SetArgument(0, static_cast(args.m)); kernel.SetArgument(1, static_cast(args.n)); kernel.SetArgument(2, static_cast(args.k)); kernel.SetArgument(3, GetRealArg(args.alpha)); kernel.SetArgument(4, GetRealArg(args.beta)); kernel.SetArgument(5, buffers[2]()); // 2 == A matrix kernel.SetArgument(6, 0); // a_offset kernel.SetArgument(7, static_cast(args.k)); // a_ld kernel.SetArgument(8, buffers[3]()); // 3 == B matrix kernel.SetArgument(9, 0); // b_offset kernel.SetArgument(10, static_cast(args.n)); // b_ld kernel.SetArgument(11, buffers[4]()); // 4 == C matrix kernel.SetArgument(12, 0); // c_offset kernel.SetArgument(13, static_cast(args.n)); // c_ld kernel.SetArgument(14, 1); // c_do_transpose kernel.SetArgument(15, 0); // a_conjugate kernel.SetArgument(16, 0); // b_conjugate } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/tuning/kernels/xgemv.cpp000066400000000000000000000056171463263031500204110ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file uses the auto-tuner to tune the xgemv OpenCL kernels. 
CLBlast-1.6.3/src/tuning/kernels/xgemv.cpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xgemv OpenCL kernels.
//
// =================================================================================================

#include "tuning/kernels/xgemv.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Function to tune a specific variation V (not within the clblast namespace)
template <int V>
void StartVariation(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args)) {
    case clblast::Precision::kHalf:
      clblast::Tuner<half>(argc, argv, V, clblast::XgemvGetTunerDefaults,
          clblast::XgemvGetTunerSettings<half>, clblast::XgemvTestValidArguments<half>,
          clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<half>,
          clblast::XgemvSetArguments<half>); break;
    case clblast::Precision::kSingle:
      clblast::Tuner<float>(argc, argv, V, clblast::XgemvGetTunerDefaults,
          clblast::XgemvGetTunerSettings<float>, clblast::XgemvTestValidArguments<float>,
          clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<float>,
          clblast::XgemvSetArguments<float>); break;
    case clblast::Precision::kDouble:
      clblast::Tuner<double>(argc, argv, V, clblast::XgemvGetTunerDefaults,
          clblast::XgemvGetTunerSettings<double>, clblast::XgemvTestValidArguments<double>,
          clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<double>,
          clblast::XgemvSetArguments<double>); break;
    case clblast::Precision::kComplexSingle:
      clblast::Tuner<float2>(argc, argv, V, clblast::XgemvGetTunerDefaults,
          clblast::XgemvGetTunerSettings<float2>, clblast::XgemvTestValidArguments<float2>,
          clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<float2>,
          clblast::XgemvSetArguments<float2>); break;
    case clblast::Precision::kComplexDouble:
      clblast::Tuner<double2>(argc, argv, V, clblast::XgemvGetTunerDefaults,
          clblast::XgemvGetTunerSettings<double2>, clblast::XgemvTestValidArguments<double2>,
          clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<double2>,
          clblast::XgemvSetArguments<double2>); break;
  }
}

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    StartVariation<1>(argc, argv);
    StartVariation<2>(argc, argv);
    StartVariation<3>(argc, argv);
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/xgemv.hpp:

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
// 1: The full version of the kernel
// 2: The fast version for non-transposed matrices
// 3: The fast version for transposed matrices
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults XgemvGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgM, kArgN, kArgAlpha, kArgBeta};
  settings.default_m = 2048;
  settings.default_n = 2048;
  settings.default_num_runs = 4;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings XgemvGetTunerSettings(const int V, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot");
  settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot");
  settings.sources =
#include "../src/kernels/level2/xgemv.opencl"
#include "../src/kernels/level2/xgemv_fast.opencl"
  ;

  // Buffer sizes
  settings.size_x = args.n;
  settings.size_y = args.m;
  settings.size_a = args.m * args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {0, 1, 2};
  settings.outputs = {1};

  // Sets the base thread configuration
  settings.global_size = {args.m};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1};
  settings.local_size_ref = {64};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"WGS"+std::to_string(V)}};
  settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}}
                                       : TransformVector{};

  // Sets the tuning parameters and their possible values
  if (V==1) {
    settings.parameters = {
      {"WGS"+std::to_string(V), {32, 64, 128, 256}},
      {"WPT"+std::to_string(V), {1, 2, 4}},
    };
  }
  if (V==2) {
    settings.parameters = {
      {"WGS"+std::to_string(V), {16, 32, 64, 128, 256}},
      {"WPT"+std::to_string(V), {1, 2, 4}},
      {"VW"+std::to_string(V), {1, 2, 4, 8}},
    };
  }
  if (V==3) {
    settings.parameters = {
      {"WGS"+std::to_string(V), {16, 32, 64, 128}},
      {"WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}},
      {"VW"+std::to_string(V), {1, 2, 4, 8}},
    };
  }

  // Describes how to compute the performance metrics
  settings.metric_amount = (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision);
  settings.performance_unit = "GB/s";

  return settings;
}
TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{}; // Sets the tuning parameters and their possible values if (V==1) { settings.parameters = { {"WGS"+std::to_string(V), {32, 64, 128, 256}}, {"WPT"+std::to_string(V), {1, 2, 4}}, }; } if (V==2) { settings.parameters = { {"WGS"+std::to_string(V), {16, 32, 64, 128, 256}}, {"WPT"+std::to_string(V), {1, 2, 4}}, {"VW"+std::to_string(V), {1, 2, 4, 8}}, }; } if (V==3) { settings.parameters = { {"WGS"+std::to_string(V), {16, 32, 64, 128}}, {"WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}}, {"VW"+std::to_string(V), {1, 2, 4, 8}}, }; } // Describes how to compute the performance metrics settings.metric_amount = (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); settings.performance_unit = "GB/s"; return settings; } // Tests for valid arguments template void XgemvTestValidArguments(const int, const Arguments &) { } std::vector XgemvSetConstraints(const int V) { auto constraints = std::vector(); if (V==2 || V==3) { auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}}); } if (V==3) { auto LargerOrEqual = [] (std::vector v) { return v[0] >= v[1]; }; constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}}); } return constraints; } template LocalMemSizeInfo XgemvComputeLocalMemSize(const int V) { if (V == 1 || V == 2) { return { [V] (std::vector v) -> size_t { return GetBytes(PrecisionValue()) * v[0]; }, {"WGS" + std::to_string(V)} }; } return { [V] (std::vector v) -> size_t { return GetBytes(PrecisionValue()) * (v[0] + v[1] * v[2]); }, {"WGS3", "WPT3", "WGS3"} }; } // Sets the kernel's arguments template void XgemvSetArguments(const int V, Kernel &kernel, const Arguments &args, std::vector>& buffers) { auto a_rotated = (V==3) ? 1 : 0; kernel.SetArgument(0, static_cast(args.m)); kernel.SetArgument(1, static_cast(args.n)); kernel.SetArgument(2, GetRealArg(args.alpha)); kernel.SetArgument(3, GetRealArg(args.beta)); kernel.SetArgument(4, a_rotated); kernel.SetArgument(5, buffers[2]()); // 2 == A matrix kernel.SetArgument(6, 0); kernel.SetArgument(7, static_cast(args.m)); kernel.SetArgument(8, buffers[0]()); // 0 == X vector kernel.SetArgument(9, 0); kernel.SetArgument(10, 1); kernel.SetArgument(11, buffers[1]()); // 1 == Y vector kernel.SetArgument(12, 0); kernel.SetArgument(13, 1); kernel.SetArgument(14, 0); // Conjugate transpose kernel.SetArgument(15, 0); // Additional parameter kernel.SetArgument(16, 0); // Banded 'kl' kernel.SetArgument(17, 0); // Banded 'ku' } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/tuning/kernels/xger.cpp000066400000000000000000000052061463263031500202220ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file uses the auto-tuner to tune the xger OpenCL kernels. 
//
// =================================================================================================

#include "tuning/kernels/xger.hpp"

// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  try {
    const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
    switch(clblast::GetPrecision(command_line_args)) {
      case clblast::Precision::kHalf:
        clblast::Tuner<half>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<half>,
                             clblast::XgerTestValidArguments<half>, clblast::XgerSetConstraints,
                             clblast::XgerComputeLocalMemSize<half>, clblast::XgerSetArguments<half>);
        break;
      case clblast::Precision::kSingle:
        clblast::Tuner<float>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<float>,
                              clblast::XgerTestValidArguments<float>, clblast::XgerSetConstraints,
                              clblast::XgerComputeLocalMemSize<float>, clblast::XgerSetArguments<float>);
        break;
      case clblast::Precision::kDouble:
        clblast::Tuner<double>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<double>,
                               clblast::XgerTestValidArguments<double>, clblast::XgerSetConstraints,
                               clblast::XgerComputeLocalMemSize<double>, clblast::XgerSetArguments<double>);
        break;
      case clblast::Precision::kComplexSingle:
        clblast::Tuner<float2>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<float2>,
                               clblast::XgerTestValidArguments<float2>, clblast::XgerSetConstraints,
                               clblast::XgerComputeLocalMemSize<float2>, clblast::XgerSetArguments<float2>);
        break;
      case clblast::Precision::kComplexDouble:
        clblast::Tuner<double2>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<double2>,
                                clblast::XgerTestValidArguments<double2>, clblast::XgerSetConstraints,
                                clblast::XgerComputeLocalMemSize<double2>, clblast::XgerSetArguments<double2>);
        break;
    }
    return 0;
  } catch (...) { return static_cast<int>(clblast::DispatchException()); }
}

// =================================================================================================
CLBlast-1.6.3/src/tuning/kernels/xger.hpp000066400000000000000000000072051463263031500202300ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the xger OpenCL kernels.
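//
// (Worked example: the tuner's GB/s metric for this rank-1 update counts 2*m*n + m + n elements,
// i.e. a read and a write of A plus reads of x and y. For m = n = 1024 in single precision that
// is (2*1024*1024 + 2048) * 4 bytes, roughly 8.4 MB moved per kernel launch.)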
//
// =================================================================================================

#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Settings for this kernel (default command-line arguments)
TunerDefaults XgerGetTunerDefaults(const int) {
  auto settings = TunerDefaults();
  settings.options = {kArgM, kArgN, kArgAlpha};
  settings.default_m = 1024;
  settings.default_n = 1024;
  return settings;
}

// Settings for this kernel (general)
template <typename T>
TunerSettings XgerGetTunerSettings(const int, const Arguments<T> &args) {
  auto settings = TunerSettings();

  // Identification of the kernel
  settings.kernel_family = "xger";
  settings.kernel_name = "Xger";
  settings.sources =
#include "../src/kernels/level2/level2.opencl"
#include "../src/kernels/level2/xger.opencl"
  ;

  // Buffer sizes
  settings.size_x = args.m;
  settings.size_y = args.n;
  settings.size_a = args.m * args.n;

  // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
  settings.inputs = {0, 1, 2};
  settings.outputs = {2};

  // Sets the base thread configuration
  settings.global_size = {args.m, args.n};
  settings.global_size_ref = settings.global_size;
  settings.local_size = {1, 1};
  settings.local_size_ref = {8, 8};

  // Transforms the thread configuration based on the parameters
  settings.mul_local = {{"WGS1", "WGS2"}};
  settings.div_global = {{"WPT", "WPT"}};

  // Sets the tuning parameters and their possible values
  settings.parameters = {
    {"WGS1", {4, 8, 16, 32, 64, 128, 256, 512}},
    {"WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}},
    {"WPT", {1, 2, 4}},
  };

  // Describes how to compute the performance metrics
  settings.metric_amount = (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision);
  settings.performance_unit = "GB/s";

  return settings;
}

// Tests for valid arguments
template <typename T>
void XgerTestValidArguments(const int, const Arguments<T> &) { }

// Sets the constraints (none for this kernel)
std::vector<Constraint> XgerSetConstraints(const int) { return {}; }

// Computes the amount of local memory needed (none for this kernel)
template <typename T>
LocalMemSizeInfo XgerComputeLocalMemSize(const int) {
  return { [] (std::vector<size_t>) -> size_t { return 0; }, {} };
}

// Sets the kernel's arguments
template <typename T>
void XgerSetArguments(const int, Kernel &kernel, const Arguments<T> &args, std::vector<Buffer<T>>& buffers) {
  kernel.SetArgument(0, static_cast<int>(args.m));
  kernel.SetArgument(1, static_cast<int>(args.n));
  kernel.SetArgument(2, GetRealArg(args.alpha));
  kernel.SetArgument(3, buffers[0]()); // 0 == X vector
  kernel.SetArgument(4, 0); // x_offset
  kernel.SetArgument(5, 1); // x_increment
  kernel.SetArgument(6, buffers[1]()); // 1 == Y vector
  kernel.SetArgument(7, 0); // y_offset
  kernel.SetArgument(8, 1); // y_increment
  kernel.SetArgument(9, buffers[2]()); // 2 == A matrix
  kernel.SetArgument(10, 0); // a_offset
  kernel.SetArgument(11, static_cast<int>(args.m)); // a_ld
  kernel.SetArgument(12, 0); // a_is_rowmajor
}

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/tuning/routines/000077500000000000000000000000001463263031500167535ustar00rootroot00000000000000CLBlast-1.6.3/src/tuning/routines/routine_tuner.hpp000066400000000000000000000144141463263031500223720ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the part of the auto-tuner for tuning entire routines (i.e. switching
// between direct and in-direct GEMM kernels)
//
// =================================================================================================

#ifndef CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
#define CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

template <typename T>
void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device,
                             const std::string &tuner_name, const std::string& parameter_name) {
  const auto override_status = OverrideParameters(device(), tuner_name, PrecisionValue<T>(),
                                                  {{parameter_name, minimum_size}});
  if (override_status != StatusCode::kSuccess) {
    throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
  }
}

// Computes the best switching point
TuningResult GetBestResult(const std::vector<TuningResult>& scores) {
  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
  const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
  return *best_configuration;
}
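// Illustration only (not used by the tuner): the scoring loop inside TuneKernelSelection below
// counts, for each candidate switching point, how many of the tested sizes would end up running
// the slower of the two methods. A minimal sketch of the same idea, with ratio defined as the
// indirect time divided by the direct time:
inline size_t ExampleCountMispredictions(const std::vector<double>& ratios, const size_t split) {
  auto mispredicted = size_t{0};
  for (auto j = size_t{0}; j < split; ++j) {
    mispredicted += (ratios[j] <= 1.0);  // direct would be used, but indirect was faster
  }
  for (auto j = split + 1; j < ratios.size(); ++j) {
    mispredicted += (ratios[j] > 1.0);   // indirect would be used, but direct was faster
  }
  return mispredicted;
}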
// Tunes at kernel-level
template <typename T, typename F>
void TuneKernelSelection(const Platform& platform, const Device& device, const Context& context,
                         Queue& queue, const Precision precision, F const &routine,
                         const size_t from, const size_t to, const size_t step, const size_t batch_count,
                         const size_t num_runs, const std::string &name, const std::string &tuner_name,
                         const std::string &family_name, const std::string& parameter_name) {

  // Buffers
  auto buffers = std::vector<Buffer<T>>{
      Buffer<T>(context, to * to * batch_count),
      Buffer<T>(context, to * to * batch_count),
      Buffer<T>(context, to * to * batch_count)
  };

  // In-direct version
  printf("\n* Testing the in-direct %s routine for m=n=k\n", name.c_str());
  ForceSelectIndirectFrom<T>(0, device, tuner_name, parameter_name);
  const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, routine);

  // Direct version
  printf("\n* Testing the direct %s routine for m=n=k\n", name.c_str());
  ForceSelectIndirectFrom<T>(batch_count * to + 1, device, tuner_name, parameter_name);
  const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, routine);

  // Determining final score and best kernel selection point
  assert(indirect.size() == direct.size());
  printf("\n* Collecting results\n");
  auto ratios = std::vector<double>(indirect.size());
  for (auto i = size_t{0}; i < indirect.size(); ++i) {
    ratios[i] = indirect[i].second / direct[i].second;
  }
  auto scores = std::vector<TuningResult>(ratios.size());
  for (auto i = size_t{0}; i < scores.size(); ++i) {
    auto score = 0;
    for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); }
    for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
    const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
    const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
    auto tuning_results = Configuration();
    tuning_results[parameter_name] = indirect[i].first;
    tuning_results["PRECISION"] = static_cast<size_t>(precision);
    scores[i] = TuningResult{
        name + "_kernel_selection",
        (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation
        tuning_results
    };
  }

  // Displaying results
  printf("|         ||    %12s indirect ||      %12s direct ||          |\n", name.c_str(), name.c_str());
  printf("|   m=n=k ||       ms |     GFLOPS ||       ms |   GFLOPS ||    score | (lowest score == best switching point)\n");
  printf("x---------xx----------x------------xx----------x----------xx----------x\n");
  for (auto i = size_t{0}; i < indirect.size(); ++i) {
    assert(indirect[i].first == direct[i].first);
    const auto value = indirect[i].first;
    if (indirect[i].second != -1 && direct[i].second != -1) {
      const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
      const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
      printf("| %7zu || %8.2lf | %10.1lf || %8.2lf | %8.1lf || %8.3lf |\n",
             value, indirect[i].second, gflops_indirect, direct[i].second, gflops_direct,
             scores[i].score);
    }
  }
  printf("x---------xx----------x------------xx----------x----------xx----------x\n");
  printf("\n");

  const auto best_result = GetBestResult(scores);
  const auto best_switching_point = best_result.config.at(parameter_name);
  const auto best_string = parameter_name + "=" + ToString(best_switching_point);

  // Outputs the results as JSON to disk, including some meta-data
  const auto precision_string = std::to_string(static_cast<size_t>(precision));
  auto metadata = std::vector<std::pair<std::string,std::string>>{
      {"kernel_family", family_name},
      {"precision", precision_string},
      {"arg_from", ToString(from)},
      {"arg_to", ToString(to)},
      {"arg_step", ToString(step)},
      {"best_kernel", best_result.name},
      {"best_time", ToString(best_result.score)},
      {"best_parameters", best_string}
  };
  PrintTimingsToFileAsJSON("clblast_" + family_name + "_" + precision_string + ".json",
                           device, platform, metadata, scores);
}

// =================================================================================================
} // namespace clblast

// CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
#endif
CLBlast-1.6.3/src/tuning/routines/xgemm.cpp000066400000000000000000000250421463263031500205770ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tunes the Xgemm routine at a high-level: choosing between the direct (single-kernel)
// and the in-direct (kernel plus pre/post-processing) methods.
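//
// (Illustrative note: a tuned outcome such as XGEMM_MIN_INDIRECT_SIZE=448 would mean that GEMM
// calls with m, n, and k all smaller than 448 run the single direct kernel, while larger
// problems take the in-direct path; the value 448 here is a made-up example, not a default.)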
//
// =================================================================================================

#include <algorithm>
#include <fstream>
#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "test/test_utilities.hpp"
#include "tuning/routines/routine_tuner.hpp"

namespace clblast {
// =================================================================================================

template <typename T>
void RunGemmRoutineMNK(const size_t m, const size_t n, const size_t k,
                       const Queue& queue, const std::vector<Buffer<T>>& buffers) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = Gemm(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
                     m, n, k, ConstantOne<T>(),
                     buffers[0](), 0, k,
                     buffers[1](), 0, n, ConstantOne<T>(),
                     buffers[2](), 0, n,
                     &queue_plain, &event);
  if (status != StatusCode::kSuccess) {
    throw RuntimeError("Gemm failed with status " + ToString(status));
  }
  clWaitForEvents(1, &event);
  clReleaseEvent(event);
}

template <typename T>
void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
  RunGemmRoutineMNK(value, value, value, queue, buffers);
}

template <typename T, size_t batch_count>
void RunGemmBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
  auto offsets = std::vector<size_t>(batch_count);
  auto factors = std::vector<T>(batch_count);
  for (auto i = size_t{0}; i < batch_count; ++i) {
    offsets[i] = batch_count * value;
    factors[i] = ConstantOne<T>();
  }
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = GemmBatched(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
                            value, value, value, factors.data(),
                            buffers[0](), offsets.data(), value,
                            buffers[1](), offsets.data(), value, factors.data(),
                            buffers[2](), offsets.data(), value,
                            batch_count, &queue_plain, &event);
  if (status != StatusCode::kSuccess) {
    throw RuntimeError("GemmBatched failed with status " + ToString(status));
  }
  clWaitForEvents(1, &event);
  clReleaseEvent(event);
}

template <typename T, size_t batch_count>
void RunGemmStridedBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = GemmStridedBatched(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
                                   value, value, value, ConstantOne<T>(),
                                   buffers[0](), 0, value, value * value,
                                   buffers[1](), 0, value, value * value, ConstantOne<T>(),
                                   buffers[2](), 0, value, value * value,
                                   batch_count, &queue_plain, &event);
  if (status != StatusCode::kSuccess) {
    throw RuntimeError("Gemm failed with status " + ToString(status));
  }
  clWaitForEvents(1, &event);
  clReleaseEvent(event);
}

// =================================================================================================

template <typename T>
void TuneGemmSingleSize(const Platform& platform, const Device& device, const Context& context,
                        Queue& queue, const size_t m, const size_t n, const size_t k,
                        const size_t num_runs) {

  // Buffers
  auto buffers = std::vector<Buffer<T>>{
      Buffer<T>(context, m * k),
      Buffer<T>(context, k * n),
      Buffer<T>(context, m * n)
  };
  const auto FunctionToTune = [&]() { RunGemmRoutineMNK(m, n, k, queue, buffers); };

  // Collects the timings for two methods
  auto scores = std::vector<TuningResult>();
  const auto methods = std::vector<std::string>{"in-direct", "direct"};
  for (auto& method: methods) {
    printf("* Testing the %s routine\n", method.c_str());
    const auto limit = (method == "in-direct") ? 0 : std::max(std::max(m, n), k) + 1; // small or large number
    ForceSelectIndirectFrom<T>(limit, device, "GemmRoutine", "XGEMM_MIN_INDIRECT_SIZE");
    auto time_ms = -1.0;
    try {
      time_ms = TimeFunction(num_runs, FunctionToTune);
      printf(" --> %9.2lf ms\n", time_ms);
    }
    catch (...)
    {
      const auto status_code = DispatchExceptionCatchAll(true);
      printf(" --> error %-5d\n", static_cast<int>(status_code));
    }
    auto tuning_results = Configuration();
    tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = limit;
    tuning_results["PRECISION"] = static_cast<size_t>(PrecisionValue<T>());
    scores.push_back(TuningResult{"gemm_kernel_selection_single_size", time_ms, tuning_results});
  }

  // Outputs the results as JSON to disk, including some meta-data
  const auto precision_string = std::to_string(static_cast<size_t>(PrecisionValue<T>()));
  auto metadata = std::vector<std::pair<std::string,std::string>>{
      {"kernel_family", "gemm_routine_single_size"},
      {"precision", precision_string},
      {"arg_m", ToString(m)},
      {"arg_n", ToString(n)},
      {"arg_k", ToString(k)},
  };
  PrintTimingsToFileAsJSON("clblast_gemm_routine_single_size_" + precision_string + ".json",
                           device, platform, metadata, scores);
}

// =================================================================================================

template <typename T>
void TuneXgemm(int argc, char* argv[]) {
  auto command_line_args = RetrieveCommandLineArguments(argc, argv);
  auto help = std::string{"* Options given/available:\n"};
  const auto platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
  const auto device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
  const auto precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
  const auto arg_m = GetArgument(command_line_args, help, kArgM, -1); // optional
  const auto arg_n = GetArgument(command_line_args, help, kArgN, -1); // optional
  const auto arg_k = GetArgument(command_line_args, help, kArgK, -1); // optional
  fprintf(stdout, "%s\n", help.c_str());

  // OpenCL initialisation
  const auto platform = Platform(platform_id);
  const auto device = Device(platform, device_id);
  if (!PrecisionSupported<T>(device)) {
    printf("* Unsupported precision, skipping this tuning run\n");
    return;
  }
  const auto context = Context(device);
  auto queue = Queue(context, device);

  // Pre-load GEMM kernel tuning results if they exist
  printf("* The GEMM routine tuner requires already tuned kernels\n");
  printf(" Applying tuning results from disk if they exist...\n\n");
  const auto kernel_names = {"xgemm_1", "xgemm_direct_1", "copy", "pad", "transpose", "padtranspose"};
  for (const auto& kernel_name : kernel_names) {
    const auto tuner_file_name = "clblast_" + std::string{kernel_name} + "_" +
                                 ToString(static_cast<size_t>(precision)) + ".json";
    printf("* Looking for tuning results in the current folder: '%s'\n", tuner_file_name.c_str());
    if (std::ifstream(tuner_file_name)) { // Checks if the file exists on disk
      OverrideParametersFromJSONFiles({tuner_file_name}, device(), precision);
    }
    else {
      printf(" Not found: assuming the kernel '%s' is already tuned\n\n", kernel_name);
    }
  }

  // Test for only one m/n/k size
  if (arg_m != -1 || arg_n != -1 || arg_k != -1) {
    printf("* Tuning for one specific size: m=%d, n=%d, k=%d\n", arg_m, arg_n, arg_k);
    if (arg_m == -1 || arg_n == -1 || arg_k == -1) {
      printf("* Error: If one of m/n/k specified, please specify all three\n");
      return;
    }
    TuneGemmSingleSize<T>(platform, device, context, queue, static_cast<size_t>(arg_m),
                          static_cast<size_t>(arg_n), static_cast<size_t>(arg_k), num_runs);
  }
  else {
    // Run the tuners for the XGEMM routines
    TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
                           64, 2048, 64, 1, num_runs,
                           "gemm", "GemmRoutine", "gemm_routine",
"XGEMM_MIN_INDIRECT_SIZE"); //TuneKernelSelection(platform, device, context, queue, precision, RunGemmBatchedRoutine, // 16, 128, 32, 30, num_runs, // "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE"); //TuneKernelSelection(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine, // 16, 128, 32, 30, num_runs, // "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE"); } printf("* Completed tuning process\n"); printf("\n"); } // ================================================================================================= } // namespace clblast // Shortcuts to the clblast namespace using half = clblast::half; using float2 = clblast::float2; using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { try { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { case clblast::Precision::kHalf: clblast::TuneXgemm(argc, argv); break; case clblast::Precision::kSingle: clblast::TuneXgemm(argc, argv); break; case clblast::Precision::kDouble: clblast::TuneXgemm(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::TuneXgemm(argc, argv); break; case clblast::Precision::kComplexDouble: clblast::TuneXgemm(argc, argv); break; } return 0; } catch (...) { return static_cast(clblast::DispatchException()); } } // ================================================================================================= CLBlast-1.6.3/src/tuning/routines/xtrsv.cpp000066400000000000000000000133661463263031500206560ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tunes the Xtrsv routine at a high-level: choosing an appropriate block size
//
// =================================================================================================

#include <exception>
#include <limits>
#include <string>
#include <vector>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

constexpr auto size = size_t{1024}; // 'n' argument

template <typename T>
void SetBlockSize(const size_t value, const Device &device) {
  const auto override_status = OverrideParameters(device(), "TrsvRoutine", PrecisionValue<T>(),
                                                  {{"TRSV_BLOCK_SIZE", value}});
  if (override_status != StatusCode::kSuccess) {
    throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
  }
}

template <typename T>
void RunTrsvRoutine(const size_t block_size, Queue& queue, const std::vector<Buffer<T>>& buffers) {
  SetBlockSize<T>(block_size, queue.GetDevice());
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = Trsv<T>(Layout::kRowMajor, Triangle::kLower, Transpose::kNo, Diagonal::kNonUnit,
                        size,
                        buffers[0](), 0, size, // A matrix
                        buffers[1](), 0, 1, // X vector
                        &queue_plain, &event);
  if (status != StatusCode::kSuccess) {
    throw RuntimeError("Trsv failed with status " + ToString(status));
  }
  clWaitForEvents(1, &event);
  clReleaseEvent(event);
}

template <typename T>
void TuneXtrsv(int argc, char* argv[]) {
  auto command_line_args = RetrieveCommandLineArguments(argc, argv);
  auto help = std::string{"* Options given/available:\n"};
  const auto platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
  const auto device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
  const auto precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
  fprintf(stdout, "%s\n", help.c_str());

  // Values for the block size
  const auto from = size_t{8};
  const auto to = size_t{32 + 1};
  const auto step = size_t{8};

  // OpenCL initialisation
  const auto platform = Platform(platform_id);
  const auto device = Device(platform, device_id);
  if (!PrecisionSupported<T>(device)) {
    printf("* Unsupported precision, skipping this tuning run\n");
    return;
  }
  const auto context = Context(device);
  auto queue = Queue(context, device);

  // Buffers
  auto buffers = std::vector<Buffer<T>>{
      Buffer<T>(context, size * size),
      Buffer<T>(context, size)
  };

  // Performance testing
  const auto results = TimeRoutine(from, to, step, num_runs, queue, buffers, RunTrsvRoutine<T>);

  // Stores the results in the expected format
  auto scores = std::vector<TuningResult>();
  for (const auto &result : results) {
    if (result.second != -1) {
      auto tuning_results = Configuration();
      tuning_results["TRSV_BLOCK_SIZE"] = result.first;
      tuning_results["PRECISION"] = static_cast<size_t>(precision);
      scores.emplace_back(TuningResult{"trsv_routine", result.second, tuning_results});
    }
  }

  // Computes the best result
  auto best_time = std::numeric_limits<double>::max();
  auto best_value = size_t{0};
  for (const auto &result : results) {
    if (result.second != -1 && result.second < best_time) {
      best_time = result.second;
      best_value = result.first;
    }
  }
  const auto best_string = "TRSV_BLOCK_SIZE=" + ToString(best_value);

  // Outputs the results as JSON to disk, including some meta-data
  const auto precision_string = std::to_string(static_cast<size_t>(precision));
  auto metadata = std::vector<std::pair<std::string,std::string>>{
      {"kernel_family",
"trsv_routine"}, {"precision", precision_string}, {"arg_n", ToString(size)}, {"best_kernel", "trsv_routine"}, {"best_time", ToString(best_time)}, {"best_parameters", best_string} }; PrintTimingsToFileAsJSON("clblast_routine_xtrsv_" + precision_string + ".json", device, platform, metadata, scores); printf("* Completed tuning process\n"); printf("\n"); } // ================================================================================================= } // namespace clblast // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { try { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { case clblast::Precision::kSingle: clblast::TuneXtrsv(argc, argv); break; case clblast::Precision::kDouble: clblast::TuneXtrsv(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::TuneXtrsv(argc, argv); break; case clblast::Precision::kComplexDouble: clblast::TuneXtrsv(argc, argv); break; } return 0; } catch (...) { return static_cast(clblast::DispatchException()); } } // ================================================================================================= CLBlast-1.6.3/src/tuning/tuning.cpp000066400000000000000000000532521463263031500171220ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for // the optional and stand-alone tuner binaries and not part of the core of CLBlast. 
// // ================================================================================================= #include #include #include #include #include #include #include "utilities/utilities.hpp" #include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= void PrintTimingsToFileAsJSON(const std::string &filename, const Device& device, const Platform& platform, const std::vector> &metadata, const std::vector& tuning_results) { auto num_results = tuning_results.size(); printf("* Writing a total of %zu results to '%s'\n", num_results, filename.c_str()); auto file = fopen(filename.c_str(), "w"); fprintf(file, "{\n"); for (auto &datum: metadata) { fprintf(file, " \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str()); } fprintf(file, " \"clblast_device_type\": \"%s\",\n", GetDeviceType(device).c_str()); fprintf(file, " \"clblast_device_vendor\": \"%s\",\n", GetDeviceVendor(device).c_str()); fprintf(file, " \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str()); fprintf(file, " \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str()); fprintf(file, " \"device\": \"%s\",\n", device.Name().c_str()); fprintf(file, " \"platform_vendor\": \"%s\",\n", platform.Vendor().c_str()); fprintf(file, " \"platform_version\": \"%s\",\n", platform.Version().c_str()); fprintf(file, " \"device_vendor\": \"%s\",\n", device.Vendor().c_str()); fprintf(file, " \"device_type\": \"%s\",\n", device.Type().c_str()); fprintf(file, " \"device_core_clock\": \"%zu\",\n", device.CoreClock()); fprintf(file, " \"device_compute_units\": \"%zu\",\n", device.ComputeUnits()); fprintf(file, " \"device_extra_info\": \"%s\",\n", device.GetExtraInfo().c_str()); fprintf(file, " \"results\": [\n"); // Loops over all results for (auto r = size_t{0}; r < num_results; ++r) { auto result = tuning_results[r]; fprintf(file, " {\n"); fprintf(file, " \"kernel\": \"%s\",\n", result.name.c_str()); fprintf(file, " \"time\": %.3lf,\n", result.score); // Loops over all the parameters for this result fprintf(file, " \"parameters\": {"); auto num_configs = result.config.size(); auto p = size_t{0}; for (const auto& parameter : result.config) { fprintf(file, "\"%s\": %zu", parameter.first.c_str(), parameter.second); if (p < num_configs -1 ) { fprintf(file, ","); } ++p; } fprintf(file, "}\n"); // The footer fprintf(file, " }"); if (r < num_results - 1) { fprintf(file, ","); } fprintf(file, "\n"); } fprintf(file, " ]\n"); fprintf(file, "}\n"); fclose(file); } void print_separator(const size_t parameters_size) { printf("x------x-------x"); for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); } printf("-x-----------------x-----------------x----------------x--------------x--------x-------------------x\n"); } // ================================================================================================= template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, ComputeLocalMemSizeFunc ComputeLocalMemSize, SetArgumentsFunc SetArguments) { constexpr auto kSeed = 42; // fixed seed for reproducibility // Constants holding start and end strings for terminal-output in colour #if defined(_WIN32) const std::string kPrintError = ""; const std::string kPrintSuccess = ""; const std::string kPrintMessage = ""; const std::string kPrintEnd = ""; #else const std::string 
kPrintError = "\x1b[31m"; const std::string kPrintSuccess = "\x1b[32m"; const std::string kPrintMessage = "\x1b[1m"; const std::string kPrintEnd = "\x1b[0m"; #endif // Sets the parameters and platform/device for which to tune (command-line options) const TunerDefaults defaults = GetTunerDefaults(V); auto command_line_args = RetrieveCommandLineArguments(argc, argv); auto help = std::string{"* Options given/available:\n"}; auto args = Arguments{}; args.platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); args.device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); args.precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle); for (auto &o: defaults.options) { if (o == kArgM) { args.m = GetArgument(command_line_args, help, kArgM, defaults.default_m); } if (o == kArgN) { args.n = GetArgument(command_line_args, help, kArgN, defaults.default_n); } if (o == kArgK) { args.k = GetArgument(command_line_args, help, kArgK, defaults.default_k); } if (o == kArgChannels) { args.channels = GetArgument(command_line_args, help, kArgChannels, defaults.channels); } if (o == kArgHeight) { args.height = GetArgument(command_line_args, help, kArgHeight, defaults.height); } if (o == kArgWidth) { args.width = GetArgument(command_line_args, help, kArgWidth, defaults.width); } if (o == kArgKernelH) { args.kernel_h = GetArgument(command_line_args, help, kArgKernelH, defaults.kernel_h); } if (o == kArgKernelW) { args.kernel_w = GetArgument(command_line_args, help, kArgKernelW, defaults.kernel_w); } if (o == kArgNumKernels) { args.num_kernels = GetArgument(command_line_args, help, kArgNumKernels, defaults.num_kernels); } if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar()); } if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar()); } if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); } } args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs); const auto max_l2_norm = GetArgument(command_line_args, help, kArgMaxL2Norm, 1.0e-4); printf("%s\n", help.c_str()); const TunerSettings settings = GetTunerSettings(V, args); // Tests validity of the given arguments TestValidArguments(V, args); // Initializes OpenCL const auto platform = Platform(args.platform_id); const auto device = Device(platform, args.device_id); const auto context = Context(device); auto queue = Queue(context, device); // Tests for validity of the precision and retrieves properties if (!PrecisionSupported(device)) { printf("* Unsupported precision, skipping this tuning run\n\n"); return; } const auto device_type = GetDeviceType(device); const auto device_vendor = GetDeviceVendor(device); const auto device_architecture = GetDeviceArchitecture(device); const auto device_name = GetDeviceName(device); // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows. 
const auto buffer_sizes = std::vector{ settings.size_x + kCanarySize, settings.size_y + kCanarySize, settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize, settings.size_temp + kCanarySize }; std::mt19937 mt(kSeed); std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); auto source_buffers = std::vector>(); auto reference_buffers = std::vector>(); auto result_buffers = std::vector>(); auto device_buffers = std::vector>(); for (const auto size : buffer_sizes) { auto host_buffer = std::vector(size); PopulateVector(host_buffer, mt, dist); source_buffers.push_back(host_buffer); reference_buffers.push_back(std::vector(size)); result_buffers.push_back(std::vector(size)); device_buffers.push_back(Buffer(context, size)); } // Sets the tunable parameters and their possible values auto configurations = SetConfigurations(device, settings.parameters, settings.local_size, settings.mul_local, settings.div_local, SetConstraints(V), ComputeLocalMemSize(V)); printf("* Found %s%zu configuration(s)%s\n", kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); // Select the search method (full search or a random fraction) if (args.fraction != 0.0 && args.fraction != 1.0) { const auto new_size = static_cast(configurations.size() / args.fraction); auto rng = std::default_random_engine{}; std::shuffle(std::begin(configurations), std::end(configurations), rng); configurations.resize(new_size); printf("* Exploring a random subset of %s%zu configuration(s)%s\n", kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); } // Prints information about the parameters printf("* Parameters explored: "); for (const auto& parameter : settings.parameters) { printf("%s ", parameter.first.c_str()); } printf("\n"); // Prints the header of the table printf("\n"); printf("| ID | total |"); for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } printf("param | local | global | compiles | time | %6s | status |\n", settings.performance_unit.c_str()); print_separator(settings.parameters.size()); // First runs a reference example to compare against try { printf("| ref | - |"); for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } printf(" - |"); // Sets the input for (const auto id : settings.inputs) { device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); } // Sets the thread configuration auto global = settings.global_size_ref; auto local = settings.local_size_ref; // Make sure that the global worksize is a multiple of the local for (auto i=size_t{0}; i 1 && global.size() > 1) { printf("%8zu%8zu |%8zu%8zu |", local[0], local[1], global[0], global[1]); } else { printf("%8zu%8d |%8zu%8d |", local[0], 1, global[0], 1); } // Compiles the kernel auto compiler_options = std::vector(); const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name, device, context, compiler_options, 0); auto kernel = Kernel(program, settings.kernel_name); SetArguments(V, kernel, args, device_buffers); printf(" %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str()); // Runs the kernel const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local); printf(" - |"); if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); } // Saves the result for (const auto id : settings.outputs) { device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]); } printf(" %sreference OK%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str()); 
} catch (...) { const auto status_code = DispatchExceptionCatchAll(true); printf("* Exception caught with status %d while running the reference, aborting\n", static_cast(status_code)); return; } print_separator(settings.parameters.size()); // Starts the tuning process auto results = std::vector(); for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) { try { auto configuration = configurations[config_id]; printf("| %4zu | %5zu |", config_id + 1, configurations.size()); for (const auto& parameter : settings.parameters) { printf("%5zu", configuration.at(parameter.first)); } printf(" |"); // Sets the input for (const auto id : settings.inputs) { device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); } // Sets the thread configuration auto global = SetThreadConfiguration(configuration, settings.global_size, settings.mul_global, settings.div_global); auto local = SetThreadConfiguration(configuration, settings.local_size, settings.mul_local, settings.div_local); // Make sure that the global worksize is a multiple of the local for (auto i=size_t{0}; i 1 && global.size() > 1) { printf("%8zu%8zu |%8zu%8zu |", local[0], local[1], global[0], global[1]); } else { printf("%8zu%8d |%8zu%8d |", local[0], 1, global[0], 1); } // Sets the parameters for this configuration auto kernel_source = std::string{""}; for (const auto ¶meter : configuration) { kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n"; } kernel_source += settings.sources; // Compiles the kernel const auto start_time = std::chrono::steady_clock::now(); auto compiler_options = std::vector(); const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name, device, context, compiler_options, 0, true); auto kernel = Kernel(program, settings.kernel_name); const auto elapsed_time = std::chrono::steady_clock::now() - start_time; const auto timing = std::chrono::duration(elapsed_time).count(); printf(" %sOK%s %5.0lf ms |", kPrintSuccess.c_str(), kPrintEnd.c_str(), timing); // Runs the kernel SetArguments(V, kernel, args, device_buffers); const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local); // Kernel run was not successful if (time_ms == -1.0) { printf(" - |"); printf(" %sinvalid config.%s |", kPrintError.c_str(), kPrintEnd.c_str()); printf(" <-- skipping\n"); continue; } // Compares the results auto l2_error = 0.0; for (const auto id : settings.outputs) { device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]); for (auto index = size_t{0}; index(buffer_sizes[id]); if (std::isnan(l2_error) || l2_error > max_l2_norm) { printf(" - |"); printf(" %sL2 error %8.2e%s |", kPrintError.c_str(), l2_error, kPrintEnd.c_str()); throw std::runtime_error("L2 error too large"); } } // All was OK configuration["PRECISION"] = static_cast(args.precision); results.push_back(TuningResult{settings.kernel_name, time_ms, configuration}); printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6)); printf(" %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str()); } catch (CLCudaAPIBuildError&) { const auto status_code = DispatchExceptionCatchAll(true); printf(" %scompilation error: %5d%s |", kPrintError.c_str(), static_cast(status_code), kPrintEnd.c_str()); printf(" - | - | <-- skipping\n"); } catch (...) 
{ const auto status_code = DispatchExceptionCatchAll(true); if (status_code != StatusCode::kUnknownError) { printf(" %serror code %d%s |", kPrintError.c_str(), static_cast(status_code), kPrintEnd.c_str()); } printf(" <-- skipping\n"); } } // Completed the tuning process print_separator(settings.parameters.size()); printf("\n"); if (results.size() == 0) { return; } // Computes the best results auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; const auto best_configuration = std::min_element(results.begin(), results.end(), comparison); const auto best_time_ms = best_configuration->score; if (best_time_ms == 0.0) { return; } // Computes and prints some other statistics auto average_ms = 0.0; for (const auto& result : results) { average_ms += result.score; } average_ms /= results.size(); printf("\n"); printf("* Got average result of %.2lf ms", average_ms); printf(": %.1lf %s\n", settings.metric_amount / (average_ms * 1.0e6), settings.performance_unit.c_str()); // Also prints the performance of the best-case in terms of GB/s or GFLOPS printf("* Found best result %.2lf ms", best_time_ms); printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6), settings.performance_unit.c_str()); printf("* Best parameters: "); auto best_string = std::string{""}; auto i = size_t{0}; for (const auto& config : best_configuration->config) { best_string += "" + config.first + "=" + ToString(config.second); if (i < best_configuration->config.size() - 1) { best_string += " "; } ++i; } printf("%s\n\n", best_string.c_str()); // Outputs the results as JSON to disk, including some meta-data auto precision_string = std::to_string(static_cast(args.precision)); auto metadata = std::vector>{ {"kernel_family", settings.kernel_family}, {"precision", precision_string}, {"best_kernel", best_configuration->name}, {"best_time", ToString(best_configuration->score)}, {"best_parameters", best_string} }; for (auto &o: defaults.options) { if (o == kArgM) { metadata.push_back({"arg_m", ToString(args.m)}); } if (o == kArgN) { metadata.push_back({"arg_n", ToString(args.n)}); } if (o == kArgK) { metadata.push_back({"arg_k", ToString(args.k)}); } if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); } if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); } if (o == kArgHeight) { metadata.push_back({"arg_height", ToString(args.height)}); } if (o == kArgWidth) { metadata.push_back({"arg_width", ToString(args.width)}); } if (o == kArgKernelH) { metadata.push_back({"arg_kernel_h", ToString(args.kernel_h)}); } if (o == kArgKernelW) { metadata.push_back({"arg_kernel_w", ToString(args.kernel_w)}); } if (o == kArgChannels) { metadata.push_back({"arg_channels", ToString(args.channels)}); } if (o == kArgNumKernels) { metadata.push_back({"arg_num_kernels", ToString(args.num_kernels)}); } } PrintTimingsToFileAsJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", device, platform, metadata, results); printf("* Completed tuning process\n"); printf("\n"); } // Compiles the above function template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, ComputeLocalMemSizeFunc ComputeLocalMemSize, SetArgumentsFunc SetArguments); template void Tuner(int argc, char* argv[], const 
int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, ComputeLocalMemSizeFunc ComputeLocalMemSize, SetArgumentsFunc SetArguments); template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, ComputeLocalMemSizeFunc ComputeLocalMemSize, SetArgumentsFunc SetArguments); template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, ComputeLocalMemSizeFunc ComputeLocalMemSize, SetArgumentsFunc SetArguments); template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, ComputeLocalMemSizeFunc ComputeLocalMemSize, SetArgumentsFunc SetArguments); // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/tuning/tuning.hpp000066400000000000000000000130611463263031500171210ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for // the optional and stand-alone tuner binaries and not part of the core of CLBlast. 
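//
// (Note: besides these stand-alone binaries, tuning_api.cpp further below exposes the same
// machinery programmatically: the Tune* entry points run the tuner through TunerAPI and return
// the best-found configuration in the caller-provided parameters map.)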
// // ================================================================================================= #ifndef CLBLAST_TUNING_TUNING_H_ #define CLBLAST_TUNING_TUNING_H_ #include #include #include #include #include #include #include #include "utilities/utilities.hpp" #include "utilities/compile.hpp" #include "utilities/timing.hpp" #include "tuning/configurations.hpp" namespace clblast { // ================================================================================================= // Structures for the tuners with all the default settings struct TunerDefaults { // The list of arguments relevant for this routine std::vector options = {}; // Default sizes size_t default_m = 1; size_t default_n = 1; size_t default_k = 1; size_t channels = 1; size_t height = 1; size_t width = 1; size_t kernel_h = 3; size_t kernel_w = 3; size_t num_kernels = 1; size_t batch_count = 1; // Other defaults size_t default_batch_count = 1; size_t default_num_runs = 10; // run every kernel this many times for averaging double default_fraction = 1.0; }; // Structures for the tuners with the remaining settings struct TunerSettings { // The representative kernel and the source code std::string kernel_family; std::string kernel_name; std::string sources; // Describes how to obtain the sizes of the buffers size_t size_x = 1; size_t size_y = 1; size_t size_a = 1; size_t size_b = 1; size_t size_c = 1; size_t size_temp = 1; // Inputs and outputs (X:0, Y:1, A:2, B:3, C:4, temp:5) std::vector inputs = {}; std::vector outputs = {}; // Sets the base thread configuration std::vector global_size = {}; std::vector global_size_ref = {}; std::vector local_size = {}; std::vector local_size_ref = {}; // Transforms the thread configuration based on the parameters TransformVector mul_local = {}; TransformVector div_local = {}; TransformVector mul_global = {}; TransformVector div_global = {}; // Sets the tuning parameters and their possible values std::vector parameters; // Describes how to compute the performance metrics size_t metric_amount = 0; std::string performance_unit = "N/A"; }; // ================================================================================================= struct TuningResult { std::string name; double score; Configuration config; }; void PrintTimingsToFileAsJSON(const std::string &filename, const Device& device, const Platform& platform, const std::vector> &metadata, const std::vector& tuning_results); void print_separator(const size_t parameters_size); // ================================================================================================= using GetTunerDefaultsFunc = std::function; template using GetTunerSettingsFunc = std::function &args)>; template using TestValidArgumentsFunc = std::function &args)>; using SetConstraintsFunc = std::function(const int V)>; template using ComputeLocalMemSizeFunc = std::function; template using SetArgumentsFunc = std::function &args, std::vector>& buffers)>; // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect // the results. Used for all types of kernel families. Note that this is a header-only function so // that it is automatically compiled for the various kernels (given as the 'C' template argument). 
template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, ComputeLocalMemSizeFunc ComputeLocalMemSize, SetArgumentsFunc SetArguments); // Function to run the tuners through the CLBlast API, no I/O template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map ¶meters); // ================================================================================================= } // namespace clblast // CLBLAST_TUNING_TUNING_H_ #endif CLBlast-1.6.3/src/tuning/tuning_api.cpp000066400000000000000000000650251463263031500177540ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune). // This is only used for the optional tuner binaries and not part of the core of CLBlast. // // ================================================================================================= #include #include #include #include #include #include "tuning/tuning.hpp" #include "tuning/kernels/xaxpy.hpp" #include "tuning/kernels/xdot.hpp" #include "tuning/kernels/xgemv.hpp" #include "tuning/kernels/xger.hpp" #include "tuning/kernels/xgemm.hpp" #include "tuning/kernels/xgemm_direct.hpp" #include "tuning/kernels/copy_fast.hpp" #include "tuning/kernels/copy_pad.hpp" #include "tuning/kernels/transpose_fast.hpp" #include "tuning/kernels/transpose_pad.hpp" #include "tuning/kernels/invert.hpp" namespace clblast { // ================================================================================================= template StatusCode TuneXaxpy(RawCommandQueue * queue, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, XaxpyGetTunerDefaults, XaxpyGetTunerSettings, XaxpyTestValidArguments, XaxpySetConstraints, XaxpyComputeLocalMemSize, XaxpySetArguments, parameters); } template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode TuneXdot(RawCommandQueue * queue, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.n = n; auto queue_cpp = Queue(*queue); auto status = TunerAPI(queue_cpp, args, 1, XdotGetTunerDefaults, XdotGetTunerSettings, 
XdotTestValidArguments, XdotSetConstraints, XdotComputeLocalMemSize, XdotSetArguments, parameters); if (status != StatusCode::kSuccess) { return status; } return TunerAPI(queue_cpp, args, 2, XdotGetTunerDefaults, XdotGetTunerSettings, XdotTestValidArguments, XdotSetConstraints, XdotComputeLocalMemSize, XdotSetArguments, parameters); } template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode TuneXgemv(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); auto status = TunerAPI(queue_cpp, args, 1, XgemvGetTunerDefaults, XgemvGetTunerSettings, XgemvTestValidArguments, XgemvSetConstraints, XgemvComputeLocalMemSize, XgemvSetArguments, parameters); if (status != StatusCode::kSuccess) { return status; } status = TunerAPI(queue_cpp, args, 2, XgemvGetTunerDefaults, XgemvGetTunerSettings, XgemvTestValidArguments, XgemvSetConstraints, XgemvComputeLocalMemSize, XgemvSetArguments, parameters); if (status != StatusCode::kSuccess) { return status; } return TunerAPI(queue_cpp, args, 3, XgemvGetTunerDefaults, XgemvGetTunerSettings, XgemvTestValidArguments, XgemvSetConstraints, XgemvComputeLocalMemSize, XgemvSetArguments, parameters); } template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneXger(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, XgerGetTunerDefaults, XgerGetTunerSettings, XgerTestValidArguments, XgerSetConstraints, XgerComputeLocalMemSize, XgerSetArguments, parameters); } template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneXgemm(RawCommandQueue * queue, const size_t m, const size_t n, const 
size_t k, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; args.k = k; auto queue_cpp = Queue(*queue); auto status = TunerAPI(queue_cpp, args, 2, XgemmGetTunerDefaults, XgemmGetTunerSettings, XgemmTestValidArguments, XgemmSetConstraints, XgemmComputeLocalMemSize, XgemmSetArguments, parameters); if (status != StatusCode::kSuccess) { return status; } return TunerAPI(queue_cpp, args, 12, XgemmGetTunerDefaults, XgemmGetTunerSettings, XgemmTestValidArguments, XgemmSetConstraints, XgemmComputeLocalMemSize, XgemmSetArguments, parameters); } template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneXgemmDirect(RawCommandQueue * queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; args.k = k; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 2, XgemmDirectGetTunerDefaults, XgemmDirectGetTunerSettings, XgemmDirectTestValidArguments, XgemmDirectSetConstraints, XgemmDirectComputeLocalMemSize, XgemmDirectSetArguments, parameters); } template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneCopy(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, CopyGetTunerDefaults, CopyGetTunerSettings, CopyTestValidArguments, CopySetConstraints, CopyComputeLocalMemSize, CopySetArguments, parameters); } template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, 
std::unordered_map&); template StatusCode TunePad(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, PadGetTunerDefaults, PadGetTunerSettings, PadTestValidArguments, PadSetConstraints, PadComputeLocalMemSize, PadSetArguments, parameters); } template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneTranspose(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, TransposeGetTunerDefaults, TransposeGetTunerSettings, TransposeTestValidArguments, TransposeSetConstraints, TransposeComputeLocalMemSize, TransposeSetArguments, parameters); } template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TunePadtranspose(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, PadtransposeGetTunerDefaults, PadtransposeGetTunerSettings, PadtransposeTestValidArguments, PadtransposeSetConstraints, PadtransposeComputeLocalMemSize, PadtransposeSetArguments, parameters); } template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneInvert(RawCommandQueue * queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; args.k = k; auto queue_cpp = 
Queue(*queue);
  return TunerAPI<T>(queue_cpp, args, 0, InvertGetTunerDefaults, InvertGetTunerSettings,
                     InvertTestValidArguments, InvertSetConstraints, InvertComputeLocalMemSize,
                     InvertSetArguments, parameters);
}
template StatusCode PUBLIC_API TuneInvert<half>(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
template StatusCode PUBLIC_API TuneInvert<float>(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
template StatusCode PUBLIC_API TuneInvert<double>(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
template StatusCode PUBLIC_API TuneInvert<float2>(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
template StatusCode PUBLIC_API TuneInvert<double2>(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);

// =================================================================================================

// The main tuner API, similar to the one in tuning.cpp, but without I/O
template <typename T>
StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
                    const GetTunerDefaultsFunc GetTunerDefaults,
                    const GetTunerSettingsFunc<T> GetTunerSettings,
                    const TestValidArgumentsFunc<T> TestValidArguments,
                    const SetConstraintsFunc SetConstraints,
                    const ComputeLocalMemSizeFunc ComputeLocalMemSize,
                    const SetArgumentsFunc<T> SetArguments,
                    std::unordered_map<std::string,size_t> &parameters) {

  // Sets the parameters and platform/device for which to tune (command-line options)
  const TunerDefaults defaults = GetTunerDefaults(V);
  const TunerSettings settings = GetTunerSettings(V, args);

  // Tests validity of the given arguments
  TestValidArguments(V, args);

  // Retrieves OpenCL classes
  const auto device = queue.GetDevice();
  const auto context = queue.GetContext();

  // Inspects whether or not FP64 is supported in case of double precision
  if ((PrecisionValue<T>() == Precision::kDouble && !PrecisionSupported<double>(device)) ||
      (PrecisionValue<T>() == Precision::kComplexDouble && !PrecisionSupported<double2>(device))) {
    return StatusCode::kNoDoublePrecision;
  }

  // As above, but for FP16 (half precision)
  if (PrecisionValue<T>() == Precision::kHalf && !PrecisionSupported<half>(device)) {
    return StatusCode::kNoHalfPrecision;
  }

  // Retrieves properties
  const auto device_type = GetDeviceType(device);
  const auto device_vendor = GetDeviceVendor(device);
  const auto device_architecture = GetDeviceArchitecture(device);
  const auto device_name = GetDeviceName(device);

  // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
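  // (Illustration of the idea, not original source text: each buffer of n elements is allocated
  // as n + kCanarySize elements, e.g.
  //   auto host_buffer = std::vector<T>(settings.size_x + kCanarySize);
  // so a badly-tuned kernel that reads or writes slightly out-of-bounds touches the padding
  // instead of unrelated memory, and the offending configuration can be detected or skipped
  // without crashing the tuning process.)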
  const auto buffer_sizes = std::vector<size_t>{
      settings.size_x + kCanarySize, settings.size_y + kCanarySize,
      settings.size_a + kCanarySize, settings.size_b + kCanarySize,
      settings.size_c + kCanarySize, settings.size_temp + kCanarySize
  };
  const auto seed = static_cast<unsigned long>(time(nullptr));
  std::mt19937 mt(seed);
  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
  auto source_buffers = std::vector<std::vector<T>>();
  auto reference_buffers = std::vector<std::vector<T>>();
  auto result_buffers = std::vector<std::vector<T>>();
  auto device_buffers = std::vector<Buffer<T>>();
  for (const auto size : buffer_sizes) {
    auto host_buffer = std::vector<T>(size);
    PopulateVector(host_buffer, mt, dist);
    source_buffers.push_back(host_buffer);
    reference_buffers.push_back(std::vector<T>(size));
    result_buffers.push_back(std::vector<T>(size));
    device_buffers.push_back(Buffer<T>(context, size));
  }

  // Sets the tunable parameters and their possible values
  auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
                                          settings.mul_local, settings.div_local,
                                          SetConstraints(V), ComputeLocalMemSize(V));

  // Select the search method (full search or a random fraction)
  if (args.fraction != 0.0 && args.fraction != 1.0) {
    const auto new_size = static_cast<size_t>(configurations.size() * args.fraction);
    auto rng = std::default_random_engine{};
    std::shuffle(std::begin(configurations), std::end(configurations), rng);
    configurations.resize(new_size);
  }

  // First runs a reference example to compare against
  try {

    // Sets the input
    for (const auto id : settings.inputs) {
      device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
    }

    // Compiles the kernel
    auto compiler_options = std::vector<std::string>();
    const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
                                           device, context, compiler_options, 0);
    auto kernel = Kernel(program, settings.kernel_name);
    SetArguments(V, kernel, args, device_buffers);

    // Runs the kernel
    const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
                                    settings.global_size_ref, settings.local_size_ref, true);
    if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }

    // Saves the result
    for (const auto id : settings.outputs) {
      device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]);
    }
  }
  catch (...)
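  // If even this reference run fails (e.g. it does not compile or exceeds device resources),
  // there is nothing to validate the candidate configurations against, so the exception is
  // converted to a StatusCode below and the whole tuning run is aborted.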
  {
    const auto status_code = DispatchExceptionCatchAll(true);
    return status_code;
  }

  // Starts the tuning process
  auto results = std::vector<TuningResult>();
  for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
    try {
      auto configuration = configurations[config_id];

      // Sets the input
      for (const auto id : settings.inputs) {
        device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
      }

      // Sets the thread configuration
      const auto global = SetThreadConfiguration(configuration, settings.global_size,
                                                 settings.mul_global, settings.div_global);
      const auto local = SetThreadConfiguration(configuration, settings.local_size,
                                                settings.mul_local, settings.div_local);

      // Sets the parameters for this configuration
      auto kernel_source = std::string{""};
      for (const auto &parameter : configuration) {
        kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n";
      }
      kernel_source += settings.sources;

      // Compiles the kernel
      auto compiler_options = std::vector<std::string>();
      const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
                                             device, context, compiler_options, 0, true);
      auto kernel = Kernel(program, settings.kernel_name);

      // Runs the kernel
      SetArguments(V, kernel, args, device_buffers);
      const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local, true);

      // Kernel run was not successful
      if (time_ms == -1.0) { continue; }

      // Compares the results
      auto l2_error = 0.0;
      for (const auto id : settings.outputs) {
        device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]);
        for (auto index = size_t{0}; index < buffer_sizes[id]; ++index) {
          l2_error += SquaredDifference(result_buffers[id][index], reference_buffers[id][index]);
        }
        l2_error /= static_cast<double>(buffer_sizes[id]);
        if (std::isnan(l2_error) || l2_error > 1.0e-4) {
          throw std::runtime_error("L2 error too large");
        }
      }
      results.push_back(TuningResult{settings.kernel_name, time_ms, configuration});
    }
    catch (...)
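    // Failures are expected inside this loop: many generated configurations are invalid for the
    // device at hand (too much local memory, compilation errors, wrong results), so exceptions
    // are swallowed here and the offending configuration is simply skipped.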
{ } } // Completed the tuning process if (results.size() == 0) { return StatusCode::kUnexpectedError; } // Computes the best results auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; const auto best_configuration = std::min_element(results.begin(), results.end(), comparison); const auto best_time_ms = best_configuration->score; if (best_time_ms == 0.0) { return StatusCode::kUnexpectedError; } // Stores the best parameters for (const auto& config : best_configuration->config) { parameters[config.first] = config.second; } return StatusCode::kSuccess; } // Compiles the above function template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/utilities/000077500000000000000000000000001463263031500156125ustar00rootroot00000000000000CLBlast-1.6.3/src/utilities/android.hpp000066400000000000000000000031521463263031500177440ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file provides macro's and definitions to make compilation work for Android. Note that this // header should only be included when compiling for Android, e.g. when __ANDROID__ is defined. 
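// (Usage sketch, not part of the original header: with this shim in place, generic code such as
//   const auto text = std::to_string(size_t{42});
//   const auto value = std::stod("1.5");
// compiles unchanged against older Android toolchains whose GNU C++ STL lacks these functions;
// on clang the standard-library versions are used instead.)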
// // ================================================================================================= #ifndef CLBLAST_ANDROID_HPP_ #define CLBLAST_ANDROID_HPP_ #ifndef __clang__ // not to include custom impl to avoid ambiguous definition // ================================================================================================= #include #include #include namespace std { // No support for these standard library functions when compiling with the GNU C++ STL template std::string to_string(T value) { std::ostringstream os; os << value; return os.str(); } inline double stod(const std::string& value) { return std::atof(value.c_str()); } inline int stoi( const std::string& str, std::size_t* pos = 0, int base = 10) { char * p_end; const auto result = std::strtol(str.c_str(), &p_end, base); return result; } } // ================================================================================================= #endif // clang header guard // CLBLAST_ANDROID_HPP_ #endif CLBlast-1.6.3/src/utilities/buffer_test.hpp000066400000000000000000000177571463263031500206540ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are // templated and thus header-only. // // ================================================================================================= #ifndef CLBLAST_BUFFER_TEST_H_ #define CLBLAST_BUFFER_TEST_H_ #include #include #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= // Tests matrix 'A' for validity template void TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, const size_t offset, const size_t ld, const bool test_lead_dim = true) { if (test_lead_dim && ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); } } catch (const Error &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); } } // Tests matrix 'B' for validity template void TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, const size_t offset, const size_t ld, const bool test_lead_dim = true) { if (test_lead_dim && ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryB); } } catch (const Error &e) { throw BLASError(StatusCode::kInvalidMatrixB, e.what()); } } // Tests matrix 'C' for validity template void TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, const size_t offset, const size_t ld) { if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimC); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryC); } } catch (const Error &e) { throw BLASError(StatusCode::kInvalidMatrixC, e.what()); } } // Tests matrix 'AP' for validity template void 
TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  try {
    const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); }
  } catch (const Error<std::runtime_error> &e) {
    throw BLASError(StatusCode::kInvalidMatrixA, e.what());
  }
}

// =================================================================================================

// Tests vector 'X' for validity
template <typename T>
void TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t inc) {
  if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementX); }
  try {
    const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryX); }
  } catch (const Error<std::runtime_error> &e) {
    throw BLASError(StatusCode::kInvalidVectorX, e.what());
  }
}

// Tests vector 'Y' for validity
template <typename T>
void TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t inc) {
  if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementY); }
  try {
    const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryY); }
  } catch (const Error<std::runtime_error> &e) {
    throw BLASError(StatusCode::kInvalidVectorY, e.what());
  }
}

// =================================================================================================

// Tests vector 'scalar' for validity
template <typename T>
void TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  try {
    const auto required_size = (n + offset) * sizeof(T);
    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); }
  } catch (const Error<std::runtime_error> &e) {
    throw BLASError(StatusCode::kInvalidVectorScalar, e.what());
  }
}

// Tests vector 'index' for validity
template <typename T>
void TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  try {
    const auto required_size = (n + offset) * sizeof(T);
    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); }
  } catch (const Error<std::runtime_error> &e) {
    throw BLASError(StatusCode::kInvalidVectorScalar, e.what());
  }
}

// =================================================================================================

// Tests matrix 'A' for validity in a batched setting
template <typename T>
void TestBatchedMatrixA(const size_t one, const size_t two, const Buffer<T>& buffer,
                        const std::vector<size_t> &offsets, const size_t ld,
                        const bool test_lead_dim = true) {
  const auto max_offset = *std::max_element(offsets.begin(), offsets.end());
  TestMatrixA(one, two, buffer, max_offset, ld, test_lead_dim);
}

// Tests matrix 'B' for validity in a batched setting
template <typename T>
void TestBatchedMatrixB(const size_t one, const size_t two, const Buffer<T>& buffer,
                        const std::vector<size_t>& offsets, const size_t ld,
                        const bool test_lead_dim = true) {
  const auto max_offset = *std::max_element(offsets.begin(), offsets.end());
  TestMatrixB(one, two, buffer, max_offset, ld, test_lead_dim);
}

// Tests matrix 'C' for validity in a batched setting
template <typename T>
void TestBatchedMatrixC(const size_t one, const size_t two, const Buffer<T>& buffer,
                        const std::vector<size_t>& offsets, const size_t ld) {
  const auto max_offset = *std::max_element(offsets.begin(), offsets.end());
  TestMatrixC(one, two, buffer, max_offset, ld);
}

// =================================================================================================

// Tests matrix 'A' for validity in a strided batched setting
template <typename T>
void
TestStridedBatchedMatrixA(const size_t one, const size_t two, const Buffer& buffer, const size_t offset, const size_t stride, const size_t batch_count, const size_t ld, const bool test_lead_dim = true) { const auto last_batch_offset = (batch_count - 1) * stride; TestMatrixA(one, two, buffer, offset + last_batch_offset, ld, test_lead_dim); } // Tests matrix 'B' for validity in a strided batched setting template void TestStridedBatchedMatrixB(const size_t one, const size_t two, const Buffer& buffer, const size_t offset, const size_t stride, const size_t batch_count, const size_t ld, const bool test_lead_dim = true) { const auto last_batch_offset = (batch_count - 1) * stride; TestMatrixB(one, two, buffer, offset + last_batch_offset, ld, test_lead_dim); } // Tests matrix 'C' for validity in a strided batched setting template void TestStridedBatchedMatrixC(const size_t one, const size_t two, const Buffer& buffer, const size_t offset, const size_t stride, const size_t batch_count, const size_t ld) { const auto last_batch_offset = (batch_count - 1) * stride; TestMatrixC(one, two, buffer, offset + last_batch_offset, ld); } // ================================================================================================= } // namespace clblast // CLBLAST_BUFFER_TEST_H_ #endif CLBlast-1.6.3/src/utilities/clblast_exceptions.cpp000066400000000000000000000070761463263031500222150ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Ivan Shapovalov // // This file implements the exception hierarchy for CLBlast. It contains classes for exceptions // generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS // errors). 
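// (Note on the mechanism used below: the Dispatch* functions are meant to be called from inside
// a catch-block. A bare 'throw;' rethrows the exception currently in flight, which is then
// re-caught by concrete type in order to map it onto a StatusCode, roughly:
//   try { throw; }
//   catch (BLASError &e) { return e.status(); }
//   catch (...)         { return StatusCode::kUnknownError; }
// This keeps the type-to-status mapping in one place instead of at every call site.)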
// // ================================================================================================= #include "utilities/clblast_exceptions.hpp" namespace { // ================================================================================================= std::string MakeReason(const std::string &reason, const std::string &subreason) { std::string r = reason; if (!subreason.empty()) { r += " (" + subreason + ")"; } return r; } } // anonymous namespace namespace clblast { // ================================================================================================= BLASError::BLASError(StatusCode status, const std::string &subreason): ErrorCode(status, subreason, "BLAS error: " + MakeReason(std::to_string(static_cast(status)), subreason)) { } RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreason): ErrorCode(status, subreason, MakeReason(std::to_string(static_cast(status)), subreason)) { } // ================================================================================================= StatusCode DispatchException(const bool silent) { const char *message = nullptr; StatusCode status; try { throw; } catch (BLASError &e) { // no message is printed for invalid argument errors status = e.status(); } catch (CLCudaAPIError &e) { message = e.what(); status = static_cast(e.status()); } catch (RuntimeErrorCode &e) { message = e.what(); status = e.status(); } catch (Error &e) { message = e.what(); status = StatusCode::kUnknownError; } if (message && !silent) { fprintf(stderr, "CLBlast: %s\n", message); } return status; } StatusCode DispatchExceptionCatchAll(const bool silent) { const char *message = nullptr; StatusCode status; try { throw; } catch (BLASError &e) { // no message is printed for invalid argument errors status = e.status(); } catch (CLCudaAPIError &e) { message = e.what(); status = static_cast(e.status()); } catch (RuntimeErrorCode &e) { message = e.what(); status = e.status(); } catch (Error &e) { message = e.what(); status = StatusCode::kUnknownError; } catch (...) { message = "unknown exception type"; status = StatusCode::kUnknownError; } if (message && !silent) { fprintf(stderr, "CLBlast: %s\n", message); } return status; } // ================================================================================================= StatusCode DispatchExceptionForC() { const char *message = nullptr; try { throw; } catch (std::exception &e) { message = e.what(); } catch (...) { message = "unknown exception"; } fprintf (stderr, "CLBlast (unexpected): %s\n", message); return StatusCode::kUnexpectedError; } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/utilities/clblast_exceptions.hpp000066400000000000000000000040401463263031500222060ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Ivan Shapovalov // // This file implements the exception hierarchy for CLBlast. It contains classes for exceptions // generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS // errors). 
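// (Usage sketch, assuming a typical call site rather than quoting one: routines validate their
// arguments and throw, and the API boundary converts this back into a status code, e.g.
//   try {
//     TestVectorX<float>(n, buffer, offset, inc);
//     // ... run the routine ...
//     return StatusCode::kSuccess;
//   } catch (...) { return DispatchException(); }
// so callers of the C-style API see StatusCode values instead of C++ exceptions.)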
// // ================================================================================================= #ifndef CLBLAST_EXCEPTIONS_H_ #define CLBLAST_EXCEPTIONS_H_ #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= // Represents a semantic error in BLAS function arguments class BLASError : public ErrorCode, StatusCode> { public: explicit BLASError(StatusCode status, const std::string &subreason = std::string{}); }; // ================================================================================================= // Represents a runtime error generated by internal logic class RuntimeErrorCode : public ErrorCode { public: explicit RuntimeErrorCode(StatusCode status, const std::string &subreason = std::string{}); }; // ================================================================================================= // Handles (most of the) runtime exceptions and converts them to StatusCode StatusCode DispatchException(const bool silent = false); StatusCode DispatchExceptionCatchAll(const bool silent = false); // Handles remaining exceptions and converts them to StatusCode::kUnhandledError StatusCode DispatchExceptionForC(); // ================================================================================================= } // namespace clblast #endif // CLBLAST_EXCEPTIONS_H_ CLBlast-1.6.3/src/utilities/compile.cpp000066400000000000000000000132211463263031500177450ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the kernel compilation functions (see the header for more information). // // ================================================================================================= #include #include #include "routines/common.hpp" #include "kernel_preprocessor.hpp" namespace clblast { // ================================================================================================= // Compiles a program from source code std::shared_ptr CompileFromSource( const std::string &source_string, const Precision precision, const std::string &routine_name, const Device& device, const Context& context, std::vector& options, const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never const bool silent) { auto header_string = std::string{""}; header_string += "#define PRECISION " + ToString(static_cast(precision)) + "\n"; // Adds the name of the routine as a define header_string += "#define ROUTINE_" + routine_name + "\n"; // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on // which it is known to work with all OpenCL platforms. if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) { header_string += "#define USE_INLINE_KEYWORD 1\n"; } // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve // performance, but might result in a reduced accuracy. if ((device.IsAMD() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) { header_string += "#define USE_CL_MAD 1\n"; } // For specific devices, use staggered/shuffled workgroup indices. 
if (device.IsAMD() && device.IsGPU()) { header_string += "#define USE_STAGGERED_INDICES 1\n"; } // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize // performance through better cache behaviour if ((device.IsARM() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) { header_string += "#define GLOBAL_MEM_FENCE 1\n"; } // For Intel GPUs with subgroup support, use subgroup shuffling. if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups) && (precision == Precision::kSingle || precision == Precision::kHalf)) { header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n"; } // For NVIDIA GPUs, inline PTX can provide subgroup support if (device.IsGPU() && device.IsNVIDIA() && precision == Precision::kSingle) { header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; // Nvidia needs to check pre or post volta due to new shuffle commands if (device.IsPostNVIDIAVolta()) { header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n"; } else { header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n"; } } // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance. // This option compiles without the workgroup size requirement and does not affect correctness. if (device.IsQualcomm()) { header_string += "#define RELAX_WORKGROUP_SIZE 1\n"; } // Optionally adds a translation header from OpenCL kernels to CUDA kernels #ifdef CUDA_API header_string += #include "kernels/opencl_to_cuda.h" ; #endif // Loads the common header (typedefs and defines and such) header_string += #include "kernels/common.opencl" ; // Prints details of the routine to compile in case of debugging in verbose mode #ifdef VERBOSE printf("[DEBUG] Compiling routine '%s-%s'\n", routine_name.c_str(), ToString(precision).c_str()); const auto start_time = std::chrono::steady_clock::now(); #endif // Runs a pre-processor to unroll loops and perform array-to-register promotion. Most OpenCL // compilers do this, but some don't. auto do_run_preprocessor = false; if (run_preprocessor == 0) { do_run_preprocessor = (device.IsARM() && device.IsGPU()); } if (run_preprocessor == 1) { do_run_preprocessor = true; } auto kernel_string = header_string + source_string; if (do_run_preprocessor) { log_debug("Running built-in pre-processor"); kernel_string = PreprocessKernelSource(kernel_string); } // Compiles the kernel auto program = std::make_shared(context, kernel_string); try { SetOpenCLKernelStandard(device, options); program->Build(device, options); } catch (const CLCudaAPIBuildError &e) { if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) { fprintf(stdout, "OpenCL compiler error/warning:\n%s\n", program->GetBuildInfo(device).c_str()); } throw; } // Prints the elapsed compilation time in case of debugging in verbose mode #ifdef VERBOSE const auto elapsed_time = std::chrono::steady_clock::now() - start_time; const auto timing = std::chrono::duration(elapsed_time).count(); printf("[DEBUG] Completed compilation in %.2lf ms\n", timing); #endif return program; } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/utilities/compile.hpp000066400000000000000000000030551463263031500177560ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains the CLBlast way to compile a kernel from source, used for the library and for // the auto-tuners. // // ================================================================================================= #ifndef CLBLAST_UTILITIES_COMPILE_H_ #define CLBLAST_UTILITIES_COMPILE_H_ #include #include #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= // Compiles a program from source code std::shared_ptr CompileFromSource( const std::string &source_string, const Precision precision, const std::string &routine_name, const Device& device, const Context& context, std::vector& options, const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never const bool silent = false); // ================================================================================================= } // namespace clblast // CLBLAST_UTILITIES_COMPILE_H_ #endif CLBlast-1.6.3/src/utilities/device_mapping.hpp000066400000000000000000000036351463263031500213040ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file describes the mappings of extracted names from OpenCL (device, board, vendor, etc.) to // more commonly used names to match devices from different vendors and platforms properly. // // ================================================================================================= #ifndef CLBLAST_UTILITIES_DEVICE_MAPPING_H_ #define CLBLAST_UTILITIES_DEVICE_MAPPING_H_ #include #include namespace clblast { // A special namespace to hold all the global constant variables namespace device_mapping { // ================================================================================================= // Alternative names for some vendor names (top-level) const std::unordered_map kVendorNames { { "Intel(R) Corporation", "Intel" }, { "GenuineIntel", "Intel" }, { "Advanced Micro Devices, Inc.", "AMD" }, { "NVIDIA Corporation", "NVIDIA" }, }; // Alternative names for some architectures (mid-level) const std::unordered_map kArchitectureNames { {"gfx803", "Fiji"}, {"gfx900", "Vega"}, }; // Alternative names for some devices (low-level) const std::unordered_map kDeviceNames { // Empty }; // Things to remove from device names (low-level) const std::vector kDeviceRemovals { "pthread-" }; // ================================================================================================= } // namespace device_mapping } // namespace clblast // CLBLAST_UTILITIES_DEVICE_MAPPING_H_ #endif CLBlast-1.6.3/src/utilities/msvc.hpp000066400000000000000000000027371463263031500173040ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file provides macro's and definitions to make compilation work on Microsoft Visual Studio, // in particular for versions older than 2015 with limited C++11 support. // MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015) // MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013) // MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012) // MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010) // MSVC++ 9.0 _MSC_VER == 1500 (Visual Studio 2008) // // ================================================================================================= #ifndef CLBLAST_MSVC_HPP_ #define CLBLAST_MSVC_HPP_ namespace clblast { // ================================================================================================= #ifdef _MSC_VER // No support for constexpr prior to 2015. Note that this only works with constants, not with // constexpr functions (unused in this project). #if _MSC_VER < 1900 #define constexpr const #endif // _MSC_VER #endif // ================================================================================================= } // namespace clblast // CLBLAST_MSVC_HPP_ #endif CLBlast-1.6.3/src/utilities/timing.cpp000066400000000000000000000056631463263031500176170ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file provides helper functions for time measurement and such. // // ================================================================================================= #include #include #include "utilities/timing.hpp" namespace clblast { // ================================================================================================= double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local) { auto event = Event(); if (!local.empty()) { // Tests for validity of the local thread sizes if (local.size() > device.MaxWorkItemDimensions()) { throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions); } const auto max_work_item_sizes = device.MaxWorkItemSizes(); for (auto i=size_t{0}; i max_work_item_sizes[i]) { throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim); } } auto local_size = size_t{1}; for (auto &item: local) { local_size *= item; } if (local_size > device.MaxWorkGroupSize()) { throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal); } // Make sure the global thread sizes are at least equal to the local sizes for (auto i=size_t{0}; i global, const std::vector &local, const bool silent) { try { const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local); if (!silent) { printf(" %9.2lf ms |", time_ms); } return time_ms; } catch (...) 
{ const auto status_code = DispatchExceptionCatchAll(true); if (!silent) { printf(" error %-5d |", static_cast(status_code)); } return -1.0; // invalid } } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/src/utilities/timing.hpp000066400000000000000000000061531463263031500176170ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file provides helper functions for time measurement and such. // // ================================================================================================= #ifndef CLBLAST_TIMING_H_ #define CLBLAST_TIMING_H_ #include #include #include #include #include #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= template double TimeFunction(const size_t num_runs, F const &function) { function(); // warm-up auto timings = std::vector(num_runs); for (auto &timing: timings) { const auto start_time = std::chrono::steady_clock::now(); function(); const auto elapsed_time = std::chrono::steady_clock::now() - start_time; timing = std::chrono::duration(elapsed_time).count(); } return *std::min_element(timings.begin(), timings.end()); } // ================================================================================================= double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local); double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local, const bool silent = false); // ================================================================================================= using Timing = std::pair; template std::vector TimeRoutine(const size_t from, const size_t to, const size_t step, const size_t num_runs, Queue& queue, const std::vector>& buffers, F const &routine) { auto timings = std::vector(); printf("| value | time |\n"); printf("x--------x--------------x\n"); for (auto value = from; value < to; value += step) { printf("| %6zu |", value); try { const auto FunctionToTune = [&]() { routine(value, queue, buffers); }; const auto time_ms = TimeFunction(num_runs, FunctionToTune); printf(" %9.2lf ms |\n", time_ms); timings.push_back({value, time_ms}); } catch (...) { const auto status_code = DispatchExceptionCatchAll(true); printf(" error %-5d |\n", static_cast(status_code)); timings.push_back({value, -1.0}); // invalid } } printf("x--------x--------------x\n"); return timings; } // ================================================================================================= } // namespace clblast // CLBLAST_TIMING_H_ #endif CLBlast-1.6.3/src/utilities/utilities.cpp000066400000000000000000000541271463263031500203420ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the common utility functions. // // ================================================================================================= #include #include #include #include #include #include #include "utilities/utilities.hpp" #include "utilities/device_mapping.hpp" namespace clblast { // ================================================================================================= // Returns a scalar with a default value template T GetScalar() { return static_cast(2.0); } template float GetScalar(); template double GetScalar(); template <> half GetScalar() { return FloatToHalf(2.0f); } template <> float2 GetScalar() { return {2.0f, 0.5f}; } template <> double2 GetScalar() { return {2.0, 0.5}; } // Returns a scalar of value 0 template T ConstantZero() { return static_cast(0.0); } template float ConstantZero(); template double ConstantZero(); template <> half ConstantZero() { return FloatToHalf(0.0f); } template <> float2 ConstantZero() { return {0.0f, 0.0f}; } template <> double2 ConstantZero() { return {0.0, 0.0}; } // Returns a scalar of value 1 template T ConstantOne() { return static_cast(1.0); } template float ConstantOne(); template double ConstantOne(); template <> half ConstantOne() { return FloatToHalf(1.0f); } template <> float2 ConstantOne() { return {1.0f, 0.0f}; } template <> double2 ConstantOne() { return {1.0, 0.0}; } // Returns a scalar of value -1 template T ConstantNegOne() { return static_cast(-1.0); } template float ConstantNegOne(); template double ConstantNegOne(); template <> half ConstantNegOne() { return FloatToHalf(-1.0f); } template <> float2 ConstantNegOne() { return {-1.0f, 0.0f}; } template <> double2 ConstantNegOne() { return {-1.0, 0.0}; } // Returns a scalar of some value template T Constant(const double val) { return static_cast(val); } template float Constant(const double); template double Constant(const double); template <> half Constant(const double val) { return FloatToHalf(static_cast(val)); } template <> float2 Constant(const double val) { return {static_cast(val), 0.0f}; } template <> double2 Constant(const double val) { return {val, 0.0}; } // Returns a small scalar value just larger than 0 template T SmallConstant() { return static_cast(1e-4); } template float SmallConstant(); template double SmallConstant(); template <> half SmallConstant() { return FloatToHalf(1e-4f); } template <> float2 SmallConstant() { return {1e-4f, 0.0f}; } template <> double2 SmallConstant() { return {1e-4, 0.0}; } // Returns the absolute value of a scalar (modulus in case of a complex number) template typename BaseType::Type AbsoluteValue(const T value) { return std::fabs(value); } template float AbsoluteValue(const float); template double AbsoluteValue(const double); template <> half AbsoluteValue(const half value) { return FloatToHalf(std::fabs(HalfToFloat(value))); } template <> float AbsoluteValue(const float2 value) { if (value.real() == 0.0f && value.imag() == 0.0f) { return 0.0f; } return std::sqrt(value.real() * value.real() + value.imag() * value.imag()); } template <> double AbsoluteValue(const double2 value) { if (value.real() == 0.0 && value.imag() == 0.0) { return 0.0; } return std::sqrt(value.real() * value.real() + value.imag() * value.imag()); } // ================================================================================================= // 
Implements the string conversion using std::to_string if possible template std::string ToString(T value) { return std::to_string(value); } template std::string ToString(int value); template std::string ToString(size_t value); template <> std::string ToString(float value) { std::ostringstream result; result << std::fixed << std::setprecision(2) << value; return result.str(); } template <> std::string ToString(double value) { std::ostringstream result; result << std::fixed << std::setprecision(2) << value; return result.str(); } template <> std::string ToString(std::string value) { return value; } // If not possible directly: special cases for complex data-types template <> std::string ToString(float2 value) { return ToString(value.real())+"+"+ToString(value.imag())+"i"; } template <> std::string ToString(double2 value) { return ToString(value.real())+"+"+ToString(value.imag())+"i"; } // If not possible directly: special case for half-precision template <> std::string ToString(half value) { return std::to_string(HalfToFloat(value)); } // If not possible directly: special cases for CLBlast data-types template <> std::string ToString(Layout value) { switch(value) { case Layout::kRowMajor: return ToString(static_cast(value))+" (row-major)"; case Layout::kColMajor: return ToString(static_cast(value))+" (col-major)"; } } template <> std::string ToString(Transpose value) { switch(value) { case Transpose::kNo: return ToString(static_cast(value))+" (regular)"; case Transpose::kYes: return ToString(static_cast(value))+" (transposed)"; case Transpose::kConjugate: return ToString(static_cast(value))+" (conjugate)"; } } template <> std::string ToString(Side value) { switch(value) { case Side::kLeft: return ToString(static_cast(value))+" (left)"; case Side::kRight: return ToString(static_cast(value))+" (right)"; } } template <> std::string ToString(Triangle value) { switch(value) { case Triangle::kUpper: return ToString(static_cast(value))+" (upper)"; case Triangle::kLower: return ToString(static_cast(value))+" (lower)"; } } template <> std::string ToString(Diagonal value) { switch(value) { case Diagonal::kUnit: return ToString(static_cast(value))+" (unit)"; case Diagonal::kNonUnit: return ToString(static_cast(value))+" (non-unit)"; } } template <> std::string ToString(Precision value) { switch(value) { case Precision::kHalf: return ToString(static_cast(value))+" (half)"; case Precision::kSingle: return ToString(static_cast(value))+" (single)"; case Precision::kDouble: return ToString(static_cast(value))+" (double)"; case Precision::kComplexSingle: return ToString(static_cast(value))+" (complex-single)"; case Precision::kComplexDouble: return ToString(static_cast(value))+" (complex-double)"; case Precision::kAny: return ToString(static_cast(value))+" (any)"; } } template <> std::string ToString(KernelMode value) { switch(value) { case KernelMode::kCrossCorrelation: return ToString(static_cast(value))+" (cross-correlation)"; case KernelMode::kConvolution: return ToString(static_cast(value))+" (convolution)"; } } template <> std::string ToString(StatusCode value) { return std::to_string(static_cast(value)); } // ================================================================================================= // Retrieves the command-line arguments in a C++ fashion. 
Also adds command-line arguments from // pre-defined environmental variables std::vector RetrieveCommandLineArguments(int argc, char *argv[]) { // Regular command-line arguments auto command_line_args = std::vector(); for (auto i=0; i T ConvertArgument(const char* value) { return static_cast(std::stoi(value)); } template size_t ConvertArgument(const char* value); template <> std::string ConvertArgument(const char* value) { return std::string{value}; } template <> half ConvertArgument(const char* value) { return FloatToHalf(static_cast(std::stod(value))); } template <> float ConvertArgument(const char* value) { return static_cast(std::stod(value)); } template <> double ConvertArgument(const char* value) { return static_cast(std::stod(value)); } template <> float2 ConvertArgument(const char* value) { auto val = static_cast(std::stod(value)); return float2{val, val}; } template <> double2 ConvertArgument(const char* value) { auto val = static_cast(std::stod(value)); return double2{val, val}; } // Variant of "ConvertArgument" with default values template T ConvertArgument(const char* value, T default_value) { if (value) { return ConvertArgument(value); } return default_value; } template size_t ConvertArgument(const char* value, size_t default_value); template std::string ConvertArgument(const char* value, std::string default_value); // This function matches patterns in the form of "-option value" or "--option value". It returns a // default value in case the option is not found in the argument string. template T GetArgument(const std::vector &arguments, std::string &help, const std::string &option, const T default_value) { // Parses the argument. Note that this supports both the given option (e.g. -device) and one with // an extra dash in front (e.g. --device). auto return_value = static_cast(default_value); for (auto c=size_t{0}; c(arguments[c].c_str()); break; } } // Updates the help message and returns help += " -"+option+" "+ToString(return_value)+" "; help += (return_value == default_value) ? 
"[=default]\n" : "\n"; return return_value; } // Compiles the above function template int GetArgument(const std::vector&, std::string&, const std::string&, const int); template size_t GetArgument(const std::vector&, std::string&, const std::string&, const size_t); template half GetArgument(const std::vector&, std::string&, const std::string&, const half); template float GetArgument(const std::vector&, std::string&, const std::string&, const float); template double GetArgument(const std::vector&, std::string&, const std::string&, const double); template float2 GetArgument(const std::vector&, std::string&, const std::string&, const float2); template double2 GetArgument(const std::vector&, std::string&, const std::string&, const double2); template std::string GetArgument(const std::vector&, std::string&, const std::string&, const std::string); template Layout GetArgument(const std::vector&, std::string&, const std::string&, const Layout); template Transpose GetArgument(const std::vector&, std::string&, const std::string&, const Transpose); template Side GetArgument(const std::vector&, std::string&, const std::string&, const Side); template Triangle GetArgument(const std::vector&, std::string&, const std::string&, const Triangle); template Diagonal GetArgument(const std::vector&, std::string&, const std::string&, const Diagonal); template Precision GetArgument(const std::vector&, std::string&, const std::string&, const Precision); template KernelMode GetArgument(const std::vector&, std::string&, const std::string&, const KernelMode); // ================================================================================================= // Returns only the precision argument Precision GetPrecision(const std::vector &arguments, const Precision default_precision) { auto dummy = std::string{}; return GetArgument(arguments, dummy, kArgPrecision, default_precision); } // ================================================================================================= // Checks whether an argument is given. Returns true or false. bool CheckArgument(const std::vector &arguments, std::string &help, const std::string &option) { // Parses the argument. Note that this supports both the given option (e.g. -device) and one with // an extra dash in front (e.g. --device). 
// =================================================================================================

// Checks whether an argument is given. Returns true or false.
bool CheckArgument(const std::vector<std::string> &arguments, std::string &help,
                   const std::string &option) {

  // Parses the argument. Note that this supports both the given option (e.g. -device) and one with
  // an extra dash in front (e.g. --device).
  auto return_value = false;
  for (auto c=size_t{0}; c<arguments.size(); ++c) {
    auto item = arguments[c];
    if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) {
      return_value = true;
    }
  }

  // Updates the help message and returns
  help += "    -"+option+" ";
  help += (return_value) ? "[true]\n" : "[false]\n";
  return return_value;
}

// =================================================================================================

// Populates a vector with random data
template <typename T>
void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
  for (auto &element: vector) { element = static_cast<T>(dist(mt)); }
}
template void PopulateVector<float>(std::vector<float>&, std::mt19937&, std::uniform_real_distribution<double>&);
template void PopulateVector<double>(std::vector<double>&, std::mt19937&, std::uniform_real_distribution<double>&);

// Specialized versions of the above for complex data-types
template <>
void PopulateVector(std::vector<float2> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
  for (auto &element: vector) {
    element.real(static_cast<float>(dist(mt)));
    element.imag(static_cast<float>(dist(mt)));
  }
}
template <>
void PopulateVector(std::vector<double2> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
  for (auto &element: vector) {
    element.real(dist(mt));
    element.imag(dist(mt));
  }
}

// Specialized versions of the above for half-precision
template <>
void PopulateVector(std::vector<half> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
  for (auto &element: vector) { element = FloatToHalf(static_cast<float>(dist(mt))); }
}

// =================================================================================================

// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
template <> typename RealArg<half>::Type GetRealArg(const half value) { return HalfToFloat(value); }
template <> typename RealArg<float>::Type GetRealArg(const float value) { return value; }
template <> typename RealArg<double>::Type GetRealArg(const double value) { return value; }
template <> typename RealArg<float2>::Type GetRealArg(const float2 value) { return value; }
template <> typename RealArg<double2>::Type GetRealArg(const double2 value) { return value; }

// =================================================================================================

// Rounding functions performing ceiling and division operations
size_t CeilDiv(const size_t x, const size_t y) {
  return 1 + ((x - 1) / y);
}
size_t Ceil(const size_t x, const size_t y) {
  return CeilDiv(x,y)*y;
}

// Helper function to determine whether or not 'a' is a multiple of 'b'
bool IsMultiple(const size_t a, const size_t b) {
  return ((a/b)*b == a) ? true : false;
}
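// Worked example for the rounding helpers above (values chosen for illustration, function name
// hypothetical): for a global size of 100 work-items and a work-group size of 32, CeilDiv yields
// the number of groups and Ceil the padded global size. Assumes <cstdio> is available here.
inline void ExampleRounding() {
  const auto num_groups = CeilDiv(size_t{100}, size_t{32}); // == 4: three groups cover only 96 items
  const auto padded = Ceil(size_t{100}, size_t{32});        // == 128: smallest multiple of 32 >= 100
  const auto ok = IsMultiple(padded, size_t{32});           // == true by construction
  printf("groups=%zu padded=%zu multiple=%d\n", num_groups, padded, static_cast<int>(ok));
}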
// =================================================================================================

// Convert the precision enum (as integer) into bytes
size_t GetBytes(const Precision precision) {
  switch(precision) {
    case Precision::kHalf: return 2;
    case Precision::kSingle: return 4;
    case Precision::kDouble: return 8;
    case Precision::kComplexSingle: return 8;
    case Precision::kComplexDouble: return 16;
    case Precision::kAny: return -1;
  }
}

// Convert the template argument into a precision value
template <> Precision PrecisionValue<half>() { return Precision::kHalf; }
template <> Precision PrecisionValue<float>() { return Precision::kSingle; }
template <> Precision PrecisionValue<double>() { return Precision::kDouble; }
template <> Precision PrecisionValue<float2>() { return Precision::kComplexSingle; }
template <> Precision PrecisionValue<double2>() { return Precision::kComplexDouble; }

// =================================================================================================

// Returns false if this precision is not supported by the device
template <> bool PrecisionSupported<float>(const Device &) { return true; }
template <> bool PrecisionSupported<float2>(const Device &) { return true; }
template <> bool PrecisionSupported<double>(const Device &device) { return device.SupportsFP64(); }
template <> bool PrecisionSupported<double2>(const Device &device) { return device.SupportsFP64(); }
template <> bool PrecisionSupported<half>(const Device &device) { return device.SupportsFP16(); }

// =================================================================================================

// Retrieves the squared difference, used for example for computing the L2 error
template <typename T>
double SquaredDifference(const T val1, const T val2) {
  const auto difference = (val1 - val2);
  return static_cast<double>(difference * difference);
}

// Compiles the default case for standard data-types
template double SquaredDifference<float>(const float, const float);
template double SquaredDifference<double>(const double, const double);

// Specialisations for non-standard data-types
template <>
double SquaredDifference(const float2 val1, const float2 val2) {
  const auto real = SquaredDifference(val1.real(), val2.real());
  const auto imag = SquaredDifference(val1.imag(), val2.imag());
  return real + imag;
}
template <>
double SquaredDifference(const double2 val1, const double2 val2) {
  const auto real = SquaredDifference(val1.real(), val2.real());
  const auto imag = SquaredDifference(val1.imag(), val2.imag());
  return real + imag;
}
template <>
double SquaredDifference(const half val1, const half val2) {
  return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2));
}
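// Illustrative sketch: accumulating SquaredDifference into a root-mean-square error over two
// buffers, the way the correctness testers compare CLBlast output against a reference. The helper
// name is hypothetical and <cmath> is assumed to be available in this translation unit.
template <typename T>
double ExampleRmsError(const std::vector<T> &result, const std::vector<T> &reference) {
  auto error = 0.0;
  for (auto i = size_t{0}; i < result.size(); ++i) {
    error += SquaredDifference(result[i], reference[i]);
  }
  if (result.size() == 0) { return 0.0; }
  return std::sqrt(error / static_cast<double>(result.size()));
}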
// =================================================================================================

// High-level info
std::string GetDeviceType(const Device& device) {
  return device.Type();
}
std::string GetDeviceVendor(const Device& device) {
  auto device_vendor = device.Vendor();
  for (auto &find_and_replace : device_mapping::kVendorNames) { // replacing to common names
    if (device_vendor == find_and_replace.first) { device_vendor = find_and_replace.second; }
  }
  return device_vendor;
}

// Mid-level info
std::string GetDeviceArchitecture(const Device& device) {
  auto device_architecture = std::string{""};
  #ifdef CUDA_API
    device_architecture = device.NVIDIAComputeCapability();
  #else
    if (device.HasExtension(kKhronosAttributesNVIDIA)) {
      device_architecture = device.NVIDIAComputeCapability();
    }
    else if (device.HasExtension(kKhronosAttributesAMD)) {
      device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm
    }
    else if ((device.IsQualcomm() && device.IsGPU())) { // queries the Adreno GPU architecture version
      device_architecture = device.AdrenoVersion();
    }
    // Note: no else - 'device_architecture' might be the empty string
  #endif
  for (auto &find_and_replace : device_mapping::kArchitectureNames) { // replacing to common names
    if (device_architecture == find_and_replace.first) { device_architecture = find_and_replace.second; }
  }
  return device_architecture;
}

// Lowest-level
std::string GetDeviceName(const Device& device) {
  auto device_name = std::string{""};
  if (device.HasExtension(kKhronosAttributesAMD)) {
    device_name = device.AMDBoardName();
  }
  else {
    device_name = device.Name();
  }
  for (auto &find_and_replace : device_mapping::kDeviceNames) { // replacing to common names
    if (device_name == find_and_replace.first) { device_name = find_and_replace.second; }
  }
  for (auto &removal : device_mapping::kDeviceRemovals) { // removing certain things
    if (device_name.find(removal) != std::string::npos) {
      auto start_position_to_erase = device_name.find(removal);
      device_name.erase(start_position_to_erase, removal.length());
    }
  }
  return device_name;
}

// =================================================================================================

void SetOpenCLKernelStandard(const Device &device, std::vector<std::string> &options) {
  // Inclusion of one of the following extensions needs OpenCL 1.2 kernels
  if (device.HasExtension(kKhronosIntelSubgroups)) {
    options.push_back("-cl-std=CL1.2");
  }
  // Otherwise we fall-back to the default CLBlast OpenCL 1.1
  else {
    options.push_back("-cl-std=CL1.1");
  }
}

// =================================================================================================

// Solve Bezout's identity
// a * p + b * q = r = GCD(a, b)
void EuclidGCD(int a, int b, int &p, int &q, int &r) {
  p = 0; q = 1;
  int p_1 = 1; int q_1 = 0;
  for (;;) {
    const int c = a % b;
    if (c == 0) { break; }
    const int p_2 = p_1; const int q_2 = q_1;
    p_1 = p; q_1 = q;
    p = p_2 - p_1 * (a / b);
    q = q_2 - q_1 * (a / b);
    a = b; b = c;
  }
  r = b;
}

// =================================================================================================
} // namespace clblast
CLBlast-1.6.3/src/utilities/utilities.hpp000066400000000000000000000331271463263031500203440ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides declarations for the common utility functions such as a command-line
// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL
// wrapper.
//
// =================================================================================================

#ifndef CLBLAST_UTILITIES_H_
#define CLBLAST_UTILITIES_H_

#include <algorithm>
#include <complex>
#include <functional>
#include <random>
#include <sstream>
#include <string>

#ifdef OPENCL_API
  #include "clpp11.hpp"
  #include "clblast.h"
#elif CUDA_API
  #include "cupp11.hpp"
  #include "clblast_cuda.h"
#endif
#include "clblast_half.h"
#include "utilities/clblast_exceptions.hpp"
#include "utilities/msvc.hpp"

namespace clblast {
// =================================================================================================

// Shorthands for half-precision
using half = unsigned short; // the 'cl_half' OpenCL type is actually an 'unsigned short'

// Shorthands for complex data-types
using float2 = std::complex<float>;
using double2 = std::complex<double>;

// Khronos OpenCL extensions
const std::string kKhronosAttributesAMD = "cl_amd_device_attribute_query";
const std::string kKhronosAttributesNVIDIA = "cl_nv_device_attribute_query";
const std::string kKhronosIntelSubgroups = "cl_intel_subgroups";

// Caught an unknown error
constexpr auto kUnknownError = -999;

// Canary size to add to buffers to check for buffer overflows
constexpr auto kCanarySize = 127;

// =================================================================================================

// The routine-specific arguments in string form
constexpr auto kArgM = "m";
constexpr auto kArgN = "n";
constexpr auto kArgK = "k";
constexpr auto kArgKL = "kl";
constexpr auto kArgKU = "ku";
constexpr auto kArgLayout = "layout";
constexpr auto kArgATransp = "transA";
constexpr auto kArgBTransp = "transB";
constexpr auto kArgSide = "side";
constexpr auto kArgTriangle = "triangle";
constexpr auto kArgDiagonal = "diagonal";
constexpr auto kArgKernelMode = "kernel_mode";
constexpr auto kArgXInc = "incx";
constexpr auto kArgYInc = "incy";
constexpr auto kArgXOffset = "offx";
constexpr auto kArgYOffset = "offy";
constexpr auto kArgALeadDim = "lda";
constexpr auto kArgBLeadDim = "ldb";
constexpr auto kArgCLeadDim = "ldc";
constexpr auto kArgAOffset = "offa";
constexpr auto kArgBOffset = "offb";
constexpr auto kArgCOffset = "offc";
constexpr auto kArgAPOffset = "offap";
constexpr auto kArgDotOffset = "offdot";
constexpr auto kArgNrm2Offset = "offnrm2";
constexpr auto kArgAsumOffset = "offasum";
constexpr auto kArgImaxOffset = "offimax";
constexpr auto kArgAlpha = "alpha";
constexpr auto kArgBeta = "beta";
constexpr auto kArgBatchCount = "batch_num";
constexpr auto kArgNumKernels = "num_kernels";

// Constants for im2col
constexpr auto kArgChannels = "channels";
constexpr auto kArgHeight = "height";
constexpr auto kArgWidth = "width";
constexpr auto kArgKernelH = "kernelh";
constexpr auto kArgKernelW = "kernelw";
constexpr auto kArgPadH = "padh";
constexpr auto kArgPadW = "padw";
constexpr auto kArgStrideH = "strideh";
constexpr auto kArgStrideW = "stridew";
constexpr auto kArgDilationH = "dilationh";
constexpr auto kArgDilationW = "dilationw";

// The tuner-specific arguments in string form
constexpr auto kArgFraction = "fraction";
constexpr auto kArgHeuristicSelection = "heuristic";
constexpr auto kArgMaxL2Norm = "max_l2_norm";

// PSO tuner-specific arguments in string form
constexpr auto kArgPsoSwarmSize = "pso_swarm_size";
constexpr auto kArgPsoInfGlobal = "pso_inf_global";
constexpr auto kArgPsoInfLocal = "pso_inf_local";
constexpr auto kArgPsoInfRandom = "pso_inf_random";

// Annealing tuner-specific arguments in string form
constexpr auto kArgAnnMaxTemp = "ann_max_temperature";
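// Illustrative sketch: the 'half' alias above is a raw 16-bit pattern, so arithmetic has to go
// through the float conversions provided by clblast_half.h (FloatToHalf / HalfToFloat). The helper
// name below is hypothetical; note the round-trip is lossy for values that need more precision
// than fp16 offers.
inline half ExampleHalfMultiply(const half a, const half b) {
  return FloatToHalf(HalfToFloat(a) * HalfToFloat(b)); // compute in fp32, store as fp16
}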
// The common arguments in string form
constexpr auto kArgPlatform = "platform";
constexpr auto kArgDevice = "device";
constexpr auto kArgPrecision = "precision";
constexpr auto kArgHelp = "h";
constexpr auto kArgQuiet = "q";
constexpr auto kArgNoAbbreviations = "no_abbrv";
constexpr auto kArgNumRuns = "runs";
constexpr auto kArgFullStatistics = "full_statistics";

// The buffer names
constexpr auto kBufVecX = "X";
constexpr auto kBufVecY = "Y";
constexpr auto kBufMatA = "A";
constexpr auto kBufMatB = "B";
constexpr auto kBufMatC = "C";
constexpr auto kBufMatAP = "AP";
constexpr auto kBufScalar = "Scalar";
constexpr auto kBufScalarUint = "ScalarUint";

// =================================================================================================

#ifdef VERBOSE
inline void log_debug(const std::string &log_string) {
  printf("[DEBUG] %s\n", log_string.c_str());
}
#else
inline void log_debug(const std::string&) { }
#endif

// =================================================================================================

// Converts a regular or complex type to its base type (e.g. float2 to float)
template <typename T> struct BaseType { using Type = T; };
template <> struct BaseType<float2> { using Type = float; };
template <> struct BaseType<double2> { using Type = double; };

// =================================================================================================

// Returns a scalar with a default value
template <typename T> T GetScalar();

// Fixed value scalars
template <typename T> T ConstantZero();
template <typename T> T ConstantOne();
template <typename T> T ConstantNegOne();
template <typename T> T Constant(const double val);
template <typename T> T SmallConstant();

// Returns the absolute value of a scalar (modulus in case of complex numbers)
template <typename T> typename BaseType<T>::Type AbsoluteValue(const T value);

// =================================================================================================

// Structure containing all possible arguments for test clients, including their default values
template <typename T>
struct Arguments {
  // Routine-specific arguments
  size_t m = 1;
  size_t n = 1;
  size_t k = 1;
  size_t ku = 1;
  size_t kl = 1;
  Layout layout = Layout::kRowMajor;
  Transpose a_transpose = Transpose::kNo;
  Transpose b_transpose = Transpose::kNo;
  Side side = Side::kLeft;
  Triangle triangle = Triangle::kUpper;
  Diagonal diagonal = Diagonal::kUnit;
  KernelMode kernel_mode = KernelMode::kCrossCorrelation;
  size_t x_inc = 1;
  size_t y_inc = 1;
  size_t x_offset = 0;
  size_t y_offset = 0;
  size_t a_ld = 1;
  size_t b_ld = 1;
  size_t c_ld = 1;
  size_t a_offset = 0;
  size_t b_offset = 0;
  size_t c_offset = 0;
  size_t ap_offset = 0;
  size_t dot_offset = 0;
  size_t nrm2_offset = 0;
  size_t asum_offset = 0;
  size_t imax_offset = 0;
  T alpha = ConstantOne<T>();
  T beta = ConstantOne<T>();
  // Arguments for im2col and convgemm
  size_t channels = 1;
  size_t height = 1;
  size_t width = 1;
  size_t kernel_h = 3;
  size_t kernel_w = 3;
  size_t pad_h = 0;
  size_t pad_w = 0;
  size_t stride_h = 1;
  size_t stride_w = 1;
  size_t dilation_h = 1;
  size_t dilation_w = 1;
  size_t num_kernels = 1;
  // Batch-specific arguments
  size_t batch_count = 1;
  std::vector<size_t> x_offsets; // = {0};
  std::vector<size_t> y_offsets; // = {0};
  std::vector<size_t> a_offsets; // = {0};
  std::vector<size_t> b_offsets; // = {0};
  std::vector<size_t> c_offsets; // = {0};
  std::vector<T> alphas; // = {ConstantOne<T>()};
  std::vector<T> betas; // = {ConstantOne<T>()};
  // Sizes
  size_t x_size = 1;
  size_t y_size = 1;
  size_t a_size = 1;
  size_t b_size = 1;
  size_t c_size = 1;
  size_t ap_size = 1;
  size_t scalar_size = 1;
  // Tuner-specific arguments
  size_t heuristic_selection = 0;
  double fraction = 1.0;
  size_t pso_swarm_size = 8;
  double pso_inf_global = 0.3;
  double pso_inf_local = 0.6;
  double pso_inf_random = 0.1;
  double ann_max_temperature = 1.0; // Is it a valid default value?
  // Client-specific arguments
  int compare_clblas = 1;
  int compare_cblas = 1;
  int compare_cublas = 1;
  size_t step = 1;
  size_t num_steps = 0;
  size_t num_runs = 10;
  std::vector<std::string> tuner_files = {};
  bool full_statistics = false;
  #ifdef CLBLAST_REF_CUBLAS
    void* cublas_handle; // cublasHandle_t
  #endif
  // Common arguments
  size_t platform_id = 0;
  size_t device_id = 0;
  Precision precision = Precision::kSingle;
  bool print_help = false;
  bool silent = false;
  bool no_abbrv = false;
};
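// Illustrative sketch: filling an Arguments instance for a GEMM-like test. Only the fields a
// routine actually reads need to be set; everything else keeps the defaults above. The function
// name and the concrete values are made up for illustration (they mirror the defaults used by the
// override-parameters test further down in this archive).
inline Arguments<float> ExampleGemmArguments() {
  auto args = Arguments<float>{};
  args.m = 256; args.n = 256; args.k = 256;
  args.layout = Layout::kRowMajor;
  args.a_ld = args.k; args.b_ld = args.n; args.c_ld = args.n; // tight leading dimensions
  args.alpha = 1.0f; args.beta = 0.0f;
  return args;
}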
// =================================================================================================

// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
// data-types such as the Layout and Transpose data-types.
template <typename T>
std::string ToString(T value);

// =================================================================================================

// String splitting by a delimiter
template <typename Out>
void split(const std::string &s, char delimiter, Out result) {
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, delimiter)) {
    *(result++) = item;
  }
}

// See above
inline std::vector<std::string> split(const std::string &s, char delimiter) {
  std::vector<std::string> elements;
  split(s, delimiter, std::back_inserter(elements));
  return elements;
}

// String character removal
inline void remove_character(std::string &str, char to_be_removed) {
  str.erase(std::remove(str.begin(), str.end(), to_be_removed), str.end());
}

// =================================================================================================

// Parses command-line and environmental-variable arguments into a std::vector of strings
std::vector<std::string> RetrieveCommandLineArguments(int argc, char *argv[]);

// Helper for the function "GetArgument"
template <typename T>
T ConvertArgument(const char* value);

// Variant of "ConvertArgument" with default values
template <typename T>
T ConvertArgument(const char* value, T default_value);

// Basic argument parser, matching patterns in the form of "-option value" and "--option value"
template <typename T>
T GetArgument(const std::vector<std::string> &arguments, std::string &help,
              const std::string &option, const T default_value);

// Returns the precision only
Precision GetPrecision(const std::vector<std::string> &arguments,
                       const Precision default_precision = Precision::kSingle);

// As in "GetArgument", but now only checks whether an argument is given or not
bool CheckArgument(const std::vector<std::string> &arguments, std::string &help,
                   const std::string &option);

// =================================================================================================

// Test/example data lower and upper limit
constexpr auto kTestDataLowerLimit = -2.0;
constexpr auto kTestDataUpperLimit = 2.0;

// Populates a vector with random data
template <typename T>
void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist);
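// Illustrative sketch: generating reproducible random test data with the declaration above, the
// same pattern the miscellaneous tests further down use (fixed seed, uniform values within the
// test-data limits). The helper name and the float-only instantiation are made up here.
inline std::vector<float> ExampleRandomTestData(const size_t size) {
  auto data = std::vector<float>(size);
  std::mt19937 mt(42); // fixed seed for reproducibility
  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
  PopulateVector(data, mt, dist);
  return data;
}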
// =================================================================================================

// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
template <typename T> struct RealArg { using Type = T; };
template <> struct RealArg<half> { using Type = float; };
template <typename T> typename RealArg<T>::Type GetRealArg(const T value);

// =================================================================================================

// Rounding functions
size_t CeilDiv(const size_t x, const size_t y);
size_t Ceil(const size_t x, const size_t y);

// Returns whether or not 'a' is a multiple of 'b'
bool IsMultiple(const size_t a, const size_t b);

// =================================================================================================

// Convert the precision enum into bytes, e.g. a double takes up 8 bytes
size_t GetBytes(const Precision precision);

// Convert the template argument into a precision value
template <typename T>
Precision PrecisionValue();

// =================================================================================================

// Returns false if this precision is not supported by the device
template <typename T>
bool PrecisionSupported(const Device &device);

// =================================================================================================

// Retrieves the squared difference, used for example for computing the L2 error
template <typename T>
double SquaredDifference(const T val1, const T val2);

// =================================================================================================

// Device information in a specific CLBlast form
std::string GetDeviceType(const Device& device);
std::string GetDeviceVendor(const Device& device);
std::string GetDeviceArchitecture(const Device& device);
std::string GetDeviceName(const Device& device);

// =================================================================================================

void SetOpenCLKernelStandard(const Device &device, std::vector<std::string> &options);

// =================================================================================================

// Solve Bezout's identity
// a * p + b * q = r = GCD(a, b)
void EuclidGCD(int a, int b, int &p, int &q, int &r);

// =================================================================================================
} // namespace clblast

// CLBLAST_UTILITIES_H_
#endif
CLBlast-1.6.3/test/000077500000000000000000000000001463263031500137675ustar00rootroot00000000000000
CLBlast-1.6.3/test/correctness/000077500000000000000000000000001463263031500163215ustar00rootroot00000000000000
CLBlast-1.6.3/test/correctness/misc/000077500000000000000000000000001463263031500172545ustar00rootroot00000000000000
CLBlast-1.6.3/test/correctness/misc/override_parameters.cpp000066400000000000000000000161341463263031500240270ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
// // Author(s): // Cedric Nugteren // // This file contains the tests for the OverrideParameters function // // ================================================================================================= #include #include #include #include #include #include "utilities/utilities.hpp" #include "test/routines/level3/xgemm.hpp" namespace clblast { // ================================================================================================= template size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::string &routine_name) { auto arguments = RetrieveCommandLineArguments(argc, argv); auto errors = size_t{0}; auto passed = size_t{0}; auto example_routine = TestXgemm<0, T>(); constexpr auto kSeed = 42; // fixed seed for reproducibility // Determines the test settings const auto kernel_name = std::string{"Xgemm"}; const auto precision = PrecisionValue(); const auto valid_settings = std::vector>{ { {"GEMMK",0}, {"KREG",1}, {"KWG",16}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, { {"GEMMK",0}, {"KREG",1}, {"KWG",32}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",32}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, { {"GEMMK",0}, {"KREG",1}, {"KWG",16}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, }; const auto invalid_settings = std::vector>{ { {"GEMMK",0}, {"KREG",1}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0} }, }; // Retrieves the arguments auto help = std::string{"Options given/available:\n"}; const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); auto args = Arguments{}; args.m = GetArgument(arguments, help, kArgM, size_t{256}); args.n = GetArgument(arguments, help, kArgN, size_t{256}); args.k = GetArgument(arguments, help, kArgK, size_t{256}); args.a_ld = GetArgument(arguments, help, kArgALeadDim, args.k); args.b_ld = GetArgument(arguments, help, kArgBLeadDim, args.n); args.c_ld = GetArgument(arguments, help, kArgCLeadDim, args.n); args.a_offset = GetArgument(arguments, help, kArgAOffset, size_t{0}); args.b_offset = GetArgument(arguments, help, kArgBOffset, size_t{0}); args.c_offset = GetArgument(arguments, help, kArgCOffset, size_t{0}); args.layout = GetArgument(arguments, help, kArgLayout, Layout::kRowMajor); args.a_transpose = GetArgument(arguments, help, kArgATransp, Transpose::kNo); args.b_transpose = GetArgument(arguments, help, kArgBTransp, Transpose::kNo); args.kernel_mode = GetArgument(arguments, help, kArgKernelMode, KernelMode::kCrossCorrelation); args.alpha = GetArgument(arguments, help, kArgAlpha, GetScalar()); args.beta = GetArgument(arguments, help, kArgBeta, GetScalar()); // Prints the help message (command-line arguments) if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); } // Initializes OpenCL const auto platform = Platform(platform_id); const auto device = Device(platform, device_id); const auto context = Context(device); auto queue = Queue(context, device); // Populate host matrices with some example data auto host_a = std::vector(args.m * args.k); auto host_b = std::vector(args.n * args.k); auto host_c = 
std::vector(args.m * args.n); std::mt19937 mt(kSeed); std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); PopulateVector(host_a, mt, dist); PopulateVector(host_b, mt, dist); PopulateVector(host_c, mt, dist); // Copy the matrices to the device auto device_a = Buffer(context, host_a.size()); auto device_b = Buffer(context, host_b.size()); auto device_c = Buffer(context, host_c.size()); auto device_temp = Buffer(context, args.m * args.n * args.k); // just to be safe device_a.Write(queue, host_a.size(), host_a); device_b.Write(queue, host_b.size(), host_b); device_c.Write(queue, host_c.size(), host_c); auto dummy = Buffer(context, 1); auto dummy_scalar = Buffer(context, 1); auto buffers = Buffers{dummy, dummy, device_a, device_b, device_c, device_temp, dummy, dummy_scalar}; // Loops over the valid combinations: run before and run afterwards fprintf(stdout, "* Testing OverrideParameters for '%s'\n", routine_name.c_str()); for (const auto &override_setting : valid_settings) { const auto status_before = example_routine.RunRoutine(args, buffers, queue); if (status_before != StatusCode::kSuccess) { errors++; continue; } // Overrides the parameters const auto status = OverrideParameters(device(), kernel_name, precision, override_setting); if (status != StatusCode::kSuccess) { errors++; continue; } // error shouldn't occur const auto status_after = example_routine.RunRoutine(args, buffers, queue); if (status_after != StatusCode::kSuccess) { errors++; continue; } passed++; } // Loops over the invalid combinations: run before and run afterwards for (const auto &override_setting : invalid_settings) { const auto status_before = example_routine.RunRoutine(args, buffers, queue); if (status_before != StatusCode::kSuccess) { errors++; continue; } // Overrides the parameters const auto status = OverrideParameters(device(), kernel_name, precision, override_setting); if (status == StatusCode::kSuccess) { errors++; continue; } // error should occur const auto status_after = example_routine.RunRoutine(args, buffers, queue); if (status_after != StatusCode::kSuccess) { errors++; continue; } passed++; } // Prints and returns the statistics std::cout << " " << passed << " test(s) passed" << std::endl; std::cout << " " << errors << " test(s) failed" << std::endl; std::cout << std::endl; return errors; } // ================================================================================================= } // namespace clblast // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunOverrideTests(argc, argv, false, "SGEMM"); errors += clblast::RunOverrideTests(argc, argv, true, "CGEMM"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/misc/preprocessor.cpp000066400000000000000000000243251463263031500225140ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // This file contains the tests for the simple integrated OpenCL pre-processor // // ================================================================================================= #include #include #include #include #include #include "utilities/utilities.hpp" #include "utilities/compile.hpp" #include "kernel_preprocessor.hpp" namespace clblast { // ================================================================================================= bool TestDefines() { const auto source1 = R"( #define VAR1 #define VAR2 32 #define VAR3 #if VAR2 == 32 #ifndef VAR1 #define ERROR1 #ifdef VAR1 #define ERROR2 #endif #else #if VAR2 == 32 || VAR3 == 4 #define SUCCESS1 #else #define ERROR3 #endif #define SUCCESS2 #endif #endif #ifndef VAR4 #define SUCCESS3 #else #define ERROR4 #endif #if defined(VAR1) && !defined(VAR3) #define ERROR5 #endif #if defined(VAR1) && defined(VAR3) #define SUCCESS4 #endif #if defined(VAR1) && !defined(VAR4) #define SUCCESS5 #endif #if defined(VAR1) && defined(VAR4) #define ERROR6 #endif )"; const auto expected1 = " #define VAR1\n" " #define VAR2 32\n" " #define VAR3\n" " #define SUCCESS1\n" " #define SUCCESS2\n" " #define SUCCESS3\n" " #define SUCCESS4\n" " #define SUCCESS5\n" " \n"; const auto result1 = PreprocessKernelSource(source1); if (result1 == expected1) { return true; } else { fprintf(stdout, "* ERROR: Pre-processor TestDefines error, got:"); fprintf(stdout, "%s", result1.c_str()); return false; } } // ================================================================================================= bool TestArrayToRegisterPromotion() { const auto source1 = R"(#define WPT 2 inline void SetValues(int float, float values[WPT], const float k) { #pragma unroll for (int i = 0; i < WPT; i += 1) { values[i] = k + j; } } __kernel void ExampleKernel() { #pragma promote_to_registers float values[WPT]; #pragma unroll for (int i = 0; i < WPT; i += 1) { values[i] = 0.0f; } SetValues(12.3f, values, -3.9f); } )"; const auto expected1 = R"(#define WPT 2 inline void SetValues(int float, float values_0, float values_1, const float k) { { values_0 = k + j; } { values_1 = k + j; } } __kernel void ExampleKernel() { float values_0; float values_1; { values_0 = 0.0f; } { values_1 = 0.0f; } SetValues(12.3f, values_0, values_1, -3.9f); } )"; const auto result1 = PreprocessKernelSource(source1); if (result1 == expected1) { return true; } else { fprintf(stdout, "* ERROR: Pre-processor TestArrayToRegisterPromotion error"); return false; } } // ================================================================================================= bool TestKernel(const Device& device, const Context& context, const std::string &kernel_name, const std::string &kernel_source, const Precision precision) { fprintf(stdout, "* Testing simple OpenCL pre-processor for '%s'\n", kernel_name.c_str()); // Verifies that the current kernel compiles properly (assumes so, otherwise throws an error) auto compiler_options_ref = std::vector(); const auto program_ref = CompileFromSource(kernel_source, precision, kernel_name, device, context, compiler_options_ref, 2); // Compiles the same kernel, but now with the pre-processor enabled try { auto compiler_options = std::vector(); const auto program = CompileFromSource(kernel_source, precision, kernel_name, device, context, compiler_options, 1); return true; } catch (const CLCudaAPIBuildError &e) { fprintf(stdout, "* ERROR: Compilation warnings/errors with pre-processed kernel, status %d\n", e.status()); return false; } catch (const 
Error &e) { fprintf(stdout, "* ERROR: Pre-processor error, message:\n%s\n", e.what()); return false; } } // ================================================================================================= size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precision precision) { auto errors = size_t{0}; auto passed = size_t{0}; // Retrieves the arguments auto help = std::string{"Options given/available:\n"}; auto arguments = RetrieveCommandLineArguments(argc, argv); const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); } // Initializes OpenCL const auto platform = Platform(platform_id); const auto device = Device(platform, device_id); const auto context = Context(device); // Basic tests if (TestDefines()) { passed++; } else { errors++; } if (TestArrayToRegisterPromotion()) { passed++; } else { errors++; } // XAXPY const auto xaxpy_sources = "#define WPT 2\n" #include "../src/kernels/level1/level1.opencl" #include "../src/kernels/level1/xaxpy.opencl" ; if (TestKernel(device, context, "XaxpyFastest", xaxpy_sources, precision)) { passed++; } else { errors++; } // XGER const auto xger_sources = "#define WPT 2\n" #include "../src/kernels/level2/level2.opencl" #include "../src/kernels/level2/xger.opencl" ; if (TestKernel(device, context, "Xger", xger_sources, precision)) { passed++; } else { errors++; } // XGEMV const auto xgemv_sources = "#define WPT1 2\n" "#define WPT2 2\n" "#define WPT3 2\n" "#define UNROLL1 4\n" "#define VW2 2\n" #include "../src/kernels/level2/xgemv.opencl" #include "../src/kernels/level2/xgemv_fast.opencl" ; if (TestKernel(device, context, "XgemvFast", xgemv_sources, precision)) { passed++; } else { errors++; } // CopyFast const auto copy_fast_sources = "#define COPY_WPT 2\n" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/copy_fast.opencl" ; if (TestKernel(device, context, "CopyMatrixFast", copy_fast_sources, precision)) { passed++; } else { errors++; } // CopyPad const auto copy_pad_sources = "#define PAD_WPTX 2\n" "#define PAD_WPTY 2\n" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/copy_pad.opencl" ; if (TestKernel(device, context, "CopyPadMatrix", copy_pad_sources, precision)) { passed++; } else { errors++; } // TransposeFast const auto transpose_fast_sources = "#define TRA_WPT 2\n" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_fast.opencl" ; if (TestKernel(device, context, "TransposeMatrixFast", transpose_fast_sources, precision)) { passed++; } else { errors++; } // TransposePad const auto transpose_pad_sources = "#define PADTRA_WPT 2\n" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_pad.opencl" ; if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; } // GEMM (in-direct) GEMMK==0 const auto gemm_sources = "#define KWI 2\n" "#define MWG 16\n" "#define NWG 16\n" "#define SA 1\n" #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" #include "../src/kernels/level3/xgemm_part3.opencl" #include "../src/kernels/level3/xgemm_part4.opencl" ; if (TestKernel(device, context, "Xgemm", gemm_sources, precision)) { passed++; } else { errors++; } 
// GEMM (in-direct) GEMMK==1 const auto gemm_sources_gemmk1 = "#define MWG 16\n" "#define NWG 16\n" "#define GEMMK 1\n" #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" #include "../src/kernels/level3/xgemm_part3.opencl" #include "../src/kernels/level3/xgemm_part4.opencl" ; if (TestKernel(device, context, "Xgemm", gemm_sources_gemmk1, precision)) { passed++; } else { errors++; } // GEMM (direct) const auto gemm_direct_sources = "#define KWID 2\n" "#define WGD 16\n" #include "../src/kernels/level3/xgemm_direct_part1.opencl" #include "../src/kernels/level3/xgemm_direct_part2.opencl" #include "../src/kernels/level3/xgemm_direct_part3.opencl" ; if (TestKernel(device, context, "XgemmDirectTN", gemm_direct_sources, precision)) { passed++; } else { errors++; } // HEMM if (precision == Precision::kComplexSingle || precision == Precision::kComplexDouble) { const auto herm_sources = "#define ROUTINE_HEMM\n" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/convert_hermitian.opencl" ; if (TestKernel(device, context, "HermLowerToSquared", herm_sources, precision)) { passed++; } else { errors++; } } // Prints and returns the statistics std::cout << std::endl; std::cout << " " << passed << " test(s) passed" << std::endl; std::cout << " " << errors << " test(s) failed" << std::endl; std::cout << std::endl; return errors; } // ================================================================================================= } // namespace clblast // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunPreprocessor(argc, argv, false, clblast::Precision::kSingle); errors += clblast::RunPreprocessor(argc, argv, true, clblast::Precision::kComplexDouble); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/misc/retrieve_parameters.cpp000066400000000000000000000066521463263031500240410ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // This file contains the tests for the RetrieveParameters function // // ================================================================================================= #include #include #include #include #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= template size_t RunRetrieveParametersTests(int argc, char *argv[], const bool silent, const std::string &routine_name) { auto arguments = RetrieveCommandLineArguments(argc, argv); auto errors = size_t{0}; auto passed = size_t{0}; // Retrieves the arguments auto help = std::string{"Options given/available:\n"}; const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); auto args = Arguments{}; // Determines the test settings const auto kernel_name = std::string{"Xgemm"}; const auto expected_parameters = std::vector{ "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN" }; const auto expected_max_value = size_t{16384}; // Prints the help message (command-line arguments) if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); } // Initializes OpenCL const auto platform = Platform(platform_id); const auto device = Device(platform, device_id); // Retrieves the parameters fprintf(stdout, "* Testing RetrieveParameters for '%s'\n", routine_name.c_str()); auto parameters = std::unordered_map(); const auto status = RetrieveParameters(device(), kernel_name, PrecisionValue(), parameters); if (status != StatusCode::kSuccess) { errors++; } // Verifies the parameters for (const auto &expected_parameter : expected_parameters) { if (parameters.find(expected_parameter) != parameters.end()) { const auto value = parameters[expected_parameter]; if (value < expected_max_value) { passed++; } else { errors++; } //std::cout << expected_parameter << " = " << value << std::endl; } else { errors++; } } // Prints and returns the statistics std::cout << " " << passed << " test(s) passed" << std::endl; std::cout << " " << errors << " test(s) failed" << std::endl; std::cout << std::endl; return errors; } // ================================================================================================= } // namespace clblast // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunRetrieveParametersTests(argc, argv, false, "SGEMM"); errors += clblast::RunRetrieveParametersTests(argc, argv, true, "CGEMM"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/000077500000000000000000000000001463263031500201715ustar00rootroot00000000000000CLBlast-1.6.3/test/correctness/routines/level1/000077500000000000000000000000001463263031500213615ustar00rootroot00000000000000CLBlast-1.6.3/test/correctness/routines/level1/xamax.cpp000066400000000000000000000026501463263031500232060ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xamax.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "iSAMAX"); errors += clblast::RunTests, double, double>(argc, argv, true, "iDAMAX"); errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "iCAMAX"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "iZAMAX"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "iHAMAX"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xasum.cpp000066400000000000000000000026451463263031500232310ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xasum.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SASUM"); errors += clblast::RunTests, double, double>(argc, argv, true, "DASUM"); errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "ScASUM"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "DzASUM"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HASUM"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xaxpy.cpp000066400000000000000000000026431463263031500232430ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xaxpy.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SAXPY"); errors += clblast::RunTests, double, double>(argc, argv, true, "DAXPY"); errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CAXPY"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZAXPY"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HAXPY"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xcopy.cpp000066400000000000000000000026431463263031500232340ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xcopy.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SCOPY"); errors += clblast::RunTests, double, double>(argc, argv, true, "DCOPY"); errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CCOPY"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZCOPY"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HCOPY"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xdot.cpp000066400000000000000000000022271463263031500230460ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdot.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SDOT"); errors += clblast::RunTests, double, double>(argc, argv, true, "DDOT"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HDOT"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xdotc.cpp000066400000000000000000000021371463263031500232110ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdotc.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CDOTC"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZDOTC"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xdotu.cpp000066400000000000000000000021371463263031500232330ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdotu.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, false, "CDOTU"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZDOTU"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xnrm2.cpp000066400000000000000000000026451463263031500231420ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
// // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xnrm2.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SNRM2"); errors += clblast::RunTests, double, double>(argc, argv, true, "DNRM2"); errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "ScNRM2"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "DzNRM2"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HNRM2"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xrot.cpp000066400000000000000000000020361463263031500230620ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrot.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SROT"); errors += clblast::RunTests, double, double>(argc, argv, true, "DROT"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xrotg.cpp000066400000000000000000000020431463263031500232270ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotg.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SROTG"); errors += clblast::RunTests, double, double>(argc, argv, true, "DROTG"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xrotm.cpp000066400000000000000000000020431463263031500232350ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotm.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SROTM"); errors += clblast::RunTests, double, double>(argc, argv, true, "DROTM"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xrotmg.cpp000066400000000000000000000020501463263031500234020ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotmg.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SROTMG"); errors += clblast::RunTests, double, double>(argc, argv, true, "DROTMG"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xscal.cpp000066400000000000000000000026431463263031500232040ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // ================================================================================================= #include "test/correctness/testblas.hpp" #include "test/routines/level1/xscal.hpp" // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests, float, float>(argc, argv, false, "SSCAL"); errors += clblast::RunTests, double, double>(argc, argv, true, "DSCAL"); errors += clblast::RunTests, clblast::float2, clblast::float2>(argc, argv, true, "CSCAL"); errors += clblast::RunTests, clblast::double2, clblast::double2>(argc, argv, true, "ZSCAL"); errors += clblast::RunTests, clblast::half, clblast::half>(argc, argv, true, "HSCAL"); if (errors > 0) { return 1; } else { return 0; } } // ================================================================================================= CLBlast-1.6.3/test/correctness/routines/level1/xswap.cpp000066400000000000000000000026431463263031500232340ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. 
CLBlast-1.6.3/test/correctness/routines/level2/

CLBlast-1.6.3/test/correctness/routines/level2/xgbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xgbmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXgbmv<float>, float, float>(argc, argv, false, "SGBMV");
  errors += clblast::RunTests<clblast::TestXgbmv<double>, double, double>(argc, argv, true, "DGBMV");
  errors += clblast::RunTests<clblast::TestXgbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGBMV");
  errors += clblast::RunTests<clblast::TestXgbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGBMV");
  errors += clblast::RunTests<clblast::TestXgbmv<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGBMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xgemv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xgemv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXgemv<float>, float, float>(argc, argv, false, "SGEMV");
  errors += clblast::RunTests<clblast::TestXgemv<double>, double, double>(argc, argv, true, "DGEMV");
  errors += clblast::RunTests<clblast::TestXgemv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMV");
  errors += clblast::RunTests<clblast::TestXgemv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMV");
  errors += clblast::RunTests<clblast::TestXgemv<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xger.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xger.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXger<float>, float, float>(argc, argv, false, "SGER");
  errors += clblast::RunTests<clblast::TestXger<double>, double, double>(argc, argv, true, "DGER");
  errors += clblast::RunTests<clblast::TestXger<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGER");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xgerc.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xgerc.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXgerc<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CGERC");
  errors += clblast::RunTests<clblast::TestXgerc<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGERC");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xgeru.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xgeru.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXgeru<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CGERU");
  errors += clblast::RunTests<clblast::TestXgeru<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGERU");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xhbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xhbmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXhbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHBMV");
  errors += clblast::RunTests<clblast::TestXhbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHBMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xhemv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xhemv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXhemv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHEMV");
  errors += clblast::RunTests<clblast::TestXhemv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHEMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xher.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xher.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXher<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHER");
  errors += clblast::RunTests<clblast::TestXher<clblast::double2,double>, clblast::double2, double>(argc, argv, true, "ZHER");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xher2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xher2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXher2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHER2");
  errors += clblast::RunTests<clblast::TestXher2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHER2");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xhpmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xhpmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXhpmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHPMV");
  errors += clblast::RunTests<clblast::TestXhpmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHPMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xhpr.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xhpr.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXhpr<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHPR");
  errors += clblast::RunTests<clblast::TestXhpr<clblast::double2,double>, clblast::double2, double>(argc, argv, true, "ZHPR");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xhpr2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xhpr2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXhpr2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHPR2");
  errors += clblast::RunTests<clblast::TestXhpr2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHPR2");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================
CLBlast-1.6.3/test/correctness/routines/level2/xsbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xsbmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXsbmv<float>, float, float>(argc, argv, false, "SSBMV");
  errors += clblast::RunTests<clblast::TestXsbmv<double>, double, double>(argc, argv, true, "DSBMV");
  errors += clblast::RunTests<clblast::TestXsbmv<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSBMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xspmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xspmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXspmv<float>, float, float>(argc, argv, false, "SSPMV");
  errors += clblast::RunTests<clblast::TestXspmv<double>, double, double>(argc, argv, true, "DSPMV");
  errors += clblast::RunTests<clblast::TestXspmv<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSPMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xspr.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xspr.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXspr<float>, float, float>(argc, argv, false, "SSPR");
  errors += clblast::RunTests<clblast::TestXspr<double>, double, double>(argc, argv, true, "DSPR");
  errors += clblast::RunTests<clblast::TestXspr<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSPR");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xspr2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xspr2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXspr2<float>, float, float>(argc, argv, false, "SSPR2");
  errors += clblast::RunTests<clblast::TestXspr2<double>, double, double>(argc, argv, true, "DSPR2");
  errors += clblast::RunTests<clblast::TestXspr2<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSPR2");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xsymv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xsymv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXsymv<float>, float, float>(argc, argv, false, "SSYMV");
  errors += clblast::RunTests<clblast::TestXsymv<double>, double, double>(argc, argv, true, "DSYMV");
  errors += clblast::RunTests<clblast::TestXsymv<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSYMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xsyr.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xsyr.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXsyr<float>, float, float>(argc, argv, false, "SSYR");
  errors += clblast::RunTests<clblast::TestXsyr<double>, double, double>(argc, argv, true, "DSYR");
  errors += clblast::RunTests<clblast::TestXsyr<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSYR");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xsyr2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xsyr2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXsyr2<float>, float, float>(argc, argv, false, "SSYR2");
  errors += clblast::RunTests<clblast::TestXsyr2<double>, double, double>(argc, argv, true, "DSYR2");
  errors += clblast::RunTests<clblast::TestXsyr2<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSYR2");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xtbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xtbmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXtbmv<float>, float, float>(argc, argv, false, "STBMV");
  errors += clblast::RunTests<clblast::TestXtbmv<double>, double, double>(argc, argv, true, "DTBMV");
  errors += clblast::RunTests<clblast::TestXtbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTBMV");
  errors += clblast::RunTests<clblast::TestXtbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTBMV");
  errors += clblast::RunTests<clblast::TestXtbmv<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HTBMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xtbsv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xtbsv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXtbsv<float>, float, float>(argc, argv, false, "STBSV");
  errors += clblast::RunTests<clblast::TestXtbsv<double>, double, double>(argc, argv, true, "DTBSV");
  errors += clblast::RunTests<clblast::TestXtbsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTBSV");
  errors += clblast::RunTests<clblast::TestXtbsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTBSV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xtpmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xtpmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXtpmv<float>, float, float>(argc, argv, false, "STPMV");
  errors += clblast::RunTests<clblast::TestXtpmv<double>, double, double>(argc, argv, true, "DTPMV");
  errors += clblast::RunTests<clblast::TestXtpmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTPMV");
  errors += clblast::RunTests<clblast::TestXtpmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTPMV");
  errors += clblast::RunTests<clblast::TestXtpmv<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HTPMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xtpsv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xtpsv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXtpsv<float>, float, float>(argc, argv, false, "STPSV");
  errors += clblast::RunTests<clblast::TestXtpsv<double>, double, double>(argc, argv, true, "DTPSV");
  errors += clblast::RunTests<clblast::TestXtpsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTPSV");
  errors += clblast::RunTests<clblast::TestXtpsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTPSV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xtrmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xtrmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXtrmv<float>, float, float>(argc, argv, false, "STRMV");
  errors += clblast::RunTests<clblast::TestXtrmv<double>, double, double>(argc, argv, true, "DTRMV");
  errors += clblast::RunTests<clblast::TestXtrmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTRMV");
  errors += clblast::RunTests<clblast::TestXtrmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTRMV");
  errors += clblast::RunTests<clblast::TestXtrmv<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HTRMV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level2/xtrsv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level2/xtrsv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXtrsv<float>, float, float>(argc, argv, false, "STRSV");
  errors += clblast::RunTests<clblast::TestXtrsv<double>, double, double>(argc, argv, true, "DTRSV");
  errors += clblast::RunTests<clblast::TestXtrsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTRSV");
  errors += clblast::RunTests<clblast::TestXtrsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTRSV");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================
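The last two template arguments of RunTests are the buffer element type and the type used for scalars and result checking. For most routines they are identical, but the Hermitian update routines (HER and HPR above, HER2K and HERK below) combine complex matrix data with a real-valued alpha or beta, which is why their drivers instantiate mixed pairs. For example, the first registration in xher.cpp reads:

    errors += clblast::RunTests<clblast::TestXher<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHER");

Note also that the triangular-solve drivers (TBSV, TPSV, TRSV, and TRSM below) register only the S, D, C and Z precisions; they have no half-precision variant.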
CLBlast-1.6.3/test/correctness/routines/level3/

CLBlast-1.6.3/test/correctness/routines/level3/xgemm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xgemm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXgemm<0, float>, float, float>(argc, argv, false, "SGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<0, double>, double, double>(argc, argv, true, "DGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<0, clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<0, clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<0, clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<1, float>, float, float>(argc, argv, true, "SGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<1, double>, double, double>(argc, argv, true, "DGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<1, clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<1, clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM");
  errors += clblast::RunTests<clblast::TestXgemm<1, clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================
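// Note on xgemm.cpp above: each precision is registered twice. The integer that leads the
// TestXgemm template list (presumably 0 and 1, following how test/routines/level3/xgemm.hpp
// parameterises the wrapper) selects which of CLBlast's GEMM code paths is exercised, so the
// same reference comparison covers both kernel variants of the routine.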
CLBlast-1.6.3/test/correctness/routines/level3/xhemm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xhemm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXhemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHEMM");
  errors += clblast::RunTests<clblast::TestXhemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHEMM");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level3/xher2k.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xher2k.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXher2k<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHER2K");
  errors += clblast::RunTests<clblast::TestXher2k<clblast::double2,double>, clblast::double2, double>(argc, argv, true, "ZHER2K");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level3/xherk.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xherk.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXherk<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHERK");
  errors += clblast::RunTests<clblast::TestXherk<clblast::double2,double>, clblast::double2, double>(argc, argv, true, "ZHERK");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level3/xsymm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xsymm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXsymm<float>, float, float>(argc, argv, false, "SSYMM");
  errors += clblast::RunTests<clblast::TestXsymm<double>, double, double>(argc, argv, true, "DSYMM");
  errors += clblast::RunTests<clblast::TestXsymm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CSYMM");
  errors += clblast::RunTests<clblast::TestXsymm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZSYMM");
  errors += clblast::RunTests<clblast::TestXsymm<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSYMM");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level3/xsyr2k.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xsyr2k.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXsyr2k<float>, float, float>(argc, argv, false, "SSYR2K");
  errors += clblast::RunTests<clblast::TestXsyr2k<double>, double, double>(argc, argv, true, "DSYR2K");
  errors += clblast::RunTests<clblast::TestXsyr2k<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CSYR2K");
  errors += clblast::RunTests<clblast::TestXsyr2k<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZSYR2K");
  errors += clblast::RunTests<clblast::TestXsyr2k<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSYR2K");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level3/xsyrk.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xsyrk.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXsyrk<float>, float, float>(argc, argv, false, "SSYRK");
  errors += clblast::RunTests<clblast::TestXsyrk<double>, double, double>(argc, argv, true, "DSYRK");
  errors += clblast::RunTests<clblast::TestXsyrk<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CSYRK");
  errors += clblast::RunTests<clblast::TestXsyrk<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZSYRK");
  errors += clblast::RunTests<clblast::TestXsyrk<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HSYRK");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level3/xtrmm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xtrmm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXtrmm<float>, float, float>(argc, argv, false, "STRMM");
  errors += clblast::RunTests<clblast::TestXtrmm<double>, double, double>(argc, argv, true, "DTRMM");
  errors += clblast::RunTests<clblast::TestXtrmm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTRMM");
  errors += clblast::RunTests<clblast::TestXtrmm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTRMM");
  errors += clblast::RunTests<clblast::TestXtrmm<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HTRMM");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/level3/xtrsm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/level3/xtrsm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXtrsm<float>, float, float>(argc, argv, false, "STRSM");
  errors += clblast::RunTests<clblast::TestXtrsm<double>, double, double>(argc, argv, true, "DTRSM");
  errors += clblast::RunTests<clblast::TestXtrsm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTRSM");
  errors += clblast::RunTests<clblast::TestXtrsm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTRSM");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================
CLBlast-1.6.3/test/correctness/routines/levelx/

CLBlast-1.6.3/test/correctness/routines/levelx/xaxpybatched.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xaxpybatched.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXaxpyBatched<float>, float, float>(argc, argv, false, "SAXPYBATCHED");
  errors += clblast::RunTests<clblast::TestXaxpyBatched<double>, double, double>(argc, argv, true, "DAXPYBATCHED");
  errors += clblast::RunTests<clblast::TestXaxpyBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CAXPYBATCHED");
  errors += clblast::RunTests<clblast::TestXaxpyBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZAXPYBATCHED");
  errors += clblast::RunTests<clblast::TestXaxpyBatched<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HAXPYBATCHED");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/levelx/xcol2im.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xcol2im.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXcol2im<float>, float, float>(argc, argv, false, "SCOL2IM");
  errors += clblast::RunTests<clblast::TestXcol2im<double>, double, double>(argc, argv, true, "DCOL2IM");
  errors += clblast::RunTests<clblast::TestXcol2im<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CCOL2IM");
  errors += clblast::RunTests<clblast::TestXcol2im<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZCOL2IM");
  errors += clblast::RunTests<clblast::TestXcol2im<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HCOL2IM");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/levelx/xconvgemm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xconvgemm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXconvgemm<float>, float, float>(argc, argv, false, "SCONVGEMM");
  errors += clblast::RunTests<clblast::TestXconvgemm<double>, double, double>(argc, argv, true, "DCONVGEMM");
  errors += clblast::RunTests<clblast::TestXconvgemm<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HCONVGEMM");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/levelx/xgemmbatched.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xgemmbatched.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXgemmBatched<float>, float, float>(argc, argv, false, "SGEMMBATCHED");
  errors += clblast::RunTests<clblast::TestXgemmBatched<double>, double, double>(argc, argv, true, "DGEMMBATCHED");
  errors += clblast::RunTests<clblast::TestXgemmBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMMBATCHED");
  errors += clblast::RunTests<clblast::TestXgemmBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMMBATCHED");
  errors += clblast::RunTests<clblast::TestXgemmBatched<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMMBATCHED");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================
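Two flavours of batched GEMM are covered here: xgemmbatched.cpp (above) tests the variant that takes per-batch-item arrays of offsets and scalars, while xgemmstridedbatched.cpp (next) tests the variant in which consecutive matrices are separated by one constant stride. In terms of addressing only (a sketch of the convention, not the exact CLBlast signatures):

    // batched:          A_i begins at a_offsets[i]             -- independent per-item offsets
    // strided-batched:  A_i begins at a_offset + i * a_stride  -- one fixed stride for all items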
CLBlast-1.6.3/test/correctness/routines/levelx/xgemmstridedbatched.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xgemmstridedbatched.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXgemmStridedBatched<float>, float, float>(argc, argv, false, "SGEMMSTRIDEDBATCHED");
  errors += clblast::RunTests<clblast::TestXgemmStridedBatched<double>, double, double>(argc, argv, true, "DGEMMSTRIDEDBATCHED");
  errors += clblast::RunTests<clblast::TestXgemmStridedBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMMSTRIDEDBATCHED");
  errors += clblast::RunTests<clblast::TestXgemmStridedBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMMSTRIDEDBATCHED");
  errors += clblast::RunTests<clblast::TestXgemmStridedBatched<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMMSTRIDEDBATCHED");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/levelx/xhad.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xhad.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXhad<float>, float, float>(argc, argv, false, "SHAD");
  errors += clblast::RunTests<clblast::TestXhad<double>, double, double>(argc, argv, true, "DHAD");
  errors += clblast::RunTests<clblast::TestXhad<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CHAD");
  errors += clblast::RunTests<clblast::TestXhad<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHAD");
  errors += clblast::RunTests<clblast::TestXhad<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HHAD");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/levelx/xim2col.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xim2col.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXim2col<float>, float, float>(argc, argv, false, "SIM2COL");
  errors += clblast::RunTests<clblast::TestXim2col<double>, double, double>(argc, argv, true, "DIM2COL");
  errors += clblast::RunTests<clblast::TestXim2col<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CIM2COL");
  errors += clblast::RunTests<clblast::TestXim2col<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZIM2COL");
  errors += clblast::RunTests<clblast::TestXim2col<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HIM2COL");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/levelx/xinvert.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xinvert.hpp"

// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXinvert<float>, float, float>(argc, argv, false, "SINVERT");
  errors += clblast::RunTests<clblast::TestXinvert<double>, double, double>(argc, argv, true, "DINVERT");
  errors += clblast::RunTests<clblast::TestXinvert<float2>, float2, float2>(argc, argv, true, "CINVERT");
  errors += clblast::RunTests<clblast::TestXinvert<double2>, double2, double2>(argc, argv, true, "ZINVERT");
  errors += clblast::RunTests<clblast::TestXinvert<half>, half, half>(argc, argv, true, "HINVERT");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================

CLBlast-1.6.3/test/correctness/routines/levelx/xomatcopy.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xomatcopy.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXomatcopy<float>, float, float>(argc, argv, false, "SOMATCOPY");
  errors += clblast::RunTests<clblast::TestXomatcopy<double>, double, double>(argc, argv, true, "DOMATCOPY");
  errors += clblast::RunTests<clblast::TestXomatcopy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "COMATCOPY");
  errors += clblast::RunTests<clblast::TestXomatcopy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZOMATCOPY");
  errors += clblast::RunTests<clblast::TestXomatcopy<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HOMATCOPY");
  if (errors > 0) { return 1; } else { return 0; }
}

// =================================================================================================
CLBlast-1.6.3/test/correctness/testblas.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestBlas class (see the header for information about the class).
//
// =================================================================================================

#include <algorithm>
#include <iostream>
#include <random>

#include "utilities/utilities.hpp"
#include "test/correctness/testblas.hpp"

namespace clblast {
// =================================================================================================

// The transpose configurations to test with: template parameter dependent
template <> const std::vector<Transpose> TestBlas<half,half>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<float2,float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<double2,double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<float2,float>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<double2,double>::kTransposes = {Transpose::kNo, Transpose::kConjugate};

// Constructor, initializes the base class tester and input data
template <typename T, typename U>
TestBlas<T,U>::TestBlas(const std::vector<Arguments<U>> &arguments, const bool silent,
                        const std::string &name, const std::vector<std::string> &options,
                        const DataPrepare prepare_data,
                        const Routine run_routine,
                        const Routine run_reference1, const Routine run_reference2,
                        const ResultGet get_result, const ResultIndex get_index,
                        const ResultIterator get_id1, const ResultIterator get_id2):
    Tester<T,U>(arguments, silent, name, options),
    kOffsets(GetOffsets()),
    kAlphaValues(GetExampleScalars<U>(full_test_)),
    kBetaValues(GetExampleScalars<U>(full_test_)),
    prepare_data_(prepare_data),
    run_routine_(run_routine),
    run_reference1_(run_reference1),
    run_reference2_(run_reference2),
    get_result_(get_result),
    get_index_(get_index),
    get_id1_(get_id1),
    get_id2_(get_id2) {

  // Sanity check
  if (!compare_clblas_ && !compare_cblas_) {
    throw std::runtime_error("Invalid configuration: no reference to test against");
  }

  // Computes the maximum sizes. This allows for a single set of input/output buffers.
  const auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
  const auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
  const auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
  const auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
  const auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
  const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
  const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end());

  // Creates test input data. Adds a 'canary' region to detect buffer overflows
  x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset + kCanarySize);
  y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset + kCanarySize);
  a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
  b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
  c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
  ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset + kCanarySize);
  scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset + kCanarySize);
  std::mt19937 mt(kSeed);
  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
  PopulateVector(x_source_, mt, dist);
  PopulateVector(y_source_, mt, dist);
  PopulateVector(a_source_, mt, dist);
  PopulateVector(b_source_, mt, dist);
  PopulateVector(c_source_, mt, dist);
  PopulateVector(ap_source_, mt, dist);
  PopulateVector(scalar_source_, mt, dist);
}

// ===============================================================================================

// Tests the routine for a wide variety of parameters
template <typename T, typename U>
void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name) {
  if (!PrecisionSupported<T>(device_)) { return; }
  TestStart("regular behaviour", name);

  // Iterates over all the to-be-tested combinations of arguments
  for (auto &args: test_vector) {

    // Adds a 'canary' region to detect buffer overflows
    args.x_size += kCanarySize;
    args.y_size += kCanarySize;
    args.a_size += kCanarySize;
    args.b_size += kCanarySize;
    args.c_size += kCanarySize;
    args.ap_size += kCanarySize;
    args.scalar_size += kCanarySize;
args.ap_size, ap_source_); scalar2.Write(queue_, args.scalar_size, scalar_source_); auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2, scalar_uint2}; // Runs CLBlast if (verbose_) { fprintf(stdout, "[CLBlast]"); std::cout << std::flush; } const auto status2 = run_routine_(args, buffers2, queue_); // Don't continue with CBLAS if there are incorrect parameters if (compare_cblas_ && status2 != StatusCode::kSuccess) { if (verbose_) { fprintf(stdout, " -> %d -> ", static_cast(status2)); std::cout << std::flush; } TestErrorCodes(status2, status2, args); continue; } // Set-up for the reference run auto x_vec1 = Buffer(context_, args.x_size); auto y_vec1 = Buffer(context_, args.y_size); auto a_mat1 = Buffer(context_, args.a_size); auto b_mat1 = Buffer(context_, args.b_size); auto c_mat1 = Buffer(context_, args.c_size); auto ap_mat1 = Buffer(context_, args.ap_size); auto scalar1 = Buffer(context_, args.scalar_size); auto scalar_uint1 = Buffer(context_, args.scalar_size); x_vec1.Write(queue_, args.x_size, x_source_); y_vec1.Write(queue_, args.y_size, y_source_); a_mat1.Write(queue_, args.a_size, a_source_); b_mat1.Write(queue_, args.b_size, b_source_); c_mat1.Write(queue_, args.c_size, c_source_); ap_mat1.Write(queue_, args.ap_size, ap_source_); scalar1.Write(queue_, args.scalar_size, scalar_source_); auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1, scalar_uint1}; // Runs the reference code if (verbose_) { if (compare_clblas_) { fprintf(stdout, " [clBLAS]"); } else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); } std::cout << std::flush; } auto status1 = StatusCode::kSuccess; if (compare_clblas_) { status1 = run_reference1_(args, buffers1, queue_); } else if (compare_cblas_) { status1 = run_reference2_(args, buffers1, queue_); } // Tests for equality of the two status codes if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; } if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { TestErrorCodes(status1, status2, args); continue; } // Downloads the results auto result1 = get_result_(args, buffers1, queue_); auto result2 = get_result_(args, buffers2, queue_); // Computes the L2 error auto l2error = 0.0; const auto kErrorMarginL2 = getL2ErrorMargin(); for (auto id1=size_t{0}; id1(get_id1_(args) * get_id2_(args)); // Checks for differences in the output auto errors = size_t{0}; for (auto id1=size_t{0}; id1= kErrorMarginL2) { errors++; } if (verbose_) { if (get_id2_(args) == 1) { std::cout << std::endl << " Error at index " << id1 << ": "; } else { std::cout << std::endl << " Error at " << id1 << "," << id2 << ": "; } std::cout << " " << ToString(result1[index]) << " (reference) versus "; std::cout << " " << ToString(result2[index]) << " (CLBlast)"; if (l2error < kErrorMarginL2) { std::cout << " - error suppressed by a low total L2 error" << std::endl; } } } } } // Checks for differences in the 'canary' region to detect buffer overflows for (auto canary_id=size_t{0}; canary_id= result1.size() || index >= result2.size()) { continue; } if (!TestSimilarity(result1[index], result2[index])) { errors++; if (verbose_) { if (get_id2_(args) == 1) { std::cout << std::endl << " Buffer overflow index " << index << ": "; } else { std::cout << std::endl << " Buffer overflow " << index << ": "; } std::cout << " " << ToString(result1[index]) << " (reference) versus "; std::cout << " " << ToString(result2[index]) << " (CLBlast)"; } } } // Report the results if (verbose_ && errors > 0) { fprintf(stdout, "\n Combined 
average L2 error: %.2e\n ", l2error); } // Tests the error count (should be zero) TestErrorCount(errors, get_id1_(args)*get_id2_(args) + kCanarySize, args); } TestEnd(); } // ================================================================================================= // Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types, // does not test for results (if any). template void TestBlas::TestInvalid(std::vector> &test_vector, const std::string &name) { if (!PrecisionSupported(device_)) { return; } if (!compare_clblas_) { return; } // not supported for CPU BLAS routines if (std::is_same::value) { return; } // not supported for half-precision TestStart("invalid buffer sizes", name); // Iterates over all the to-be-tested combinations of arguments for (const auto &args: test_vector) { // Prints the current test configuration if (verbose_) { fprintf(stdout, " Testing: %s", GetSizesString(args).c_str()); std::cout << std::flush; } // Creates the buffers. Note: we are not using the cxpp11.h C++ version since we explicitly // want to be able to create invalid buffers (no error checking here). auto x_vec1 = CreateInvalidBuffer(context_, args.x_size); auto y_vec1 = CreateInvalidBuffer(context_, args.y_size); auto a_mat1 = CreateInvalidBuffer(context_, args.a_size); auto b_mat1 = CreateInvalidBuffer(context_, args.b_size); auto c_mat1 = CreateInvalidBuffer(context_, args.c_size); auto ap_mat1 = CreateInvalidBuffer(context_, args.ap_size); auto scalar1 = CreateInvalidBuffer(context_, args.scalar_size); auto scalar_uint1 = CreateInvalidBuffer(context_, args.scalar_size); auto x_vec2 = CreateInvalidBuffer(context_, args.x_size); auto y_vec2 = CreateInvalidBuffer(context_, args.y_size); auto a_mat2 = CreateInvalidBuffer(context_, args.a_size); auto b_mat2 = CreateInvalidBuffer(context_, args.b_size); auto c_mat2 = CreateInvalidBuffer(context_, args.c_size); auto ap_mat2 = CreateInvalidBuffer(context_, args.ap_size); auto scalar2 = CreateInvalidBuffer(context_, args.scalar_size); auto scalar_uint2 = CreateInvalidBuffer(context_, args.scalar_size); auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1, scalar_uint1}; auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2, scalar_uint2}; // Runs CLBlast if (verbose_) { fprintf(stdout, "[CLBlast]"); std::cout << std::flush; } const auto status2 = run_routine_(args, buffers2, queue_); // Runs the reference code if (verbose_) { if (compare_clblas_) { fprintf(stdout, " [clBLAS]"); } else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); } std::cout << std::flush; } auto status1 = StatusCode::kSuccess; if (compare_clblas_) { status1 = run_reference1_(args, buffers1, queue_); } else if (compare_cblas_) { status1 = run_reference2_(args, buffers1, queue_); } // Tests for equality of the two status codes if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; } TestErrorCodes(status1, status2, args); } TestEnd(); } // ================================================================================================= // Compiles the templated class template class TestBlas; template class TestBlas; template class TestBlas; template class TestBlas; template class TestBlas; template class TestBlas; template class TestBlas; // ================================================================================================= } // namespace clblast 
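// =================================================================================================
// Editor's note: the testblas.hpp header below drives every per-routine test through the templated
// RunTests<C, T, U> entry point, where "C" is a routine-specific wrapper class. As a reading aid,
// here is a minimal sketch of the static interface such a wrapper is expected to provide, inferred
// from the calls made inside RunTests (C::BLASLevel, C::GetOptions, C::SetSizes, C::PrepareData,
// C::RunRoutine, and so on). The class name "TestXexample" and the trivial bodies are hypothetical
// and for illustration only; the real wrappers live under test/routines/.
//
//   template <typename T>
//   class TestXexample {
//    public:
//     static size_t BLASLevel() { return 1; }                  // 1/2/3, or 4 for non-BLAS extras
//     static std::vector<std::string> GetOptions() {           // the arguments relevant here
//       return {kArgN, kArgXInc, kArgXOffset};
//     }
//     static void SetSizes(Arguments<T> &args, Queue &queue);  // derive buffer sizes from args
//     static std::vector<std::string> BuffersIn();             // buffers read by the routine
//     static std::vector<std::string> BuffersOut();            // buffers written by the routine
//     static void PrepareData(Arguments<T>&, Queue&, const int, ...);   // optional input set-up
//     static StatusCode RunRoutine(const Arguments<T>&, Buffers<T>&, Queue&);     // CLBlast
//     static StatusCode RunReference1(const Arguments<T>&, Buffers<T>&, Queue&);  // clBLAS
//     static StatusCode RunReference2(const Arguments<T>&, BuffersHost<T>&, Queue&);  // CPU BLAS
//     static std::vector<T> DownloadResult(const Arguments<T>&, Buffers<T>&, Queue&);
//     static size_t GetResultIndex(const Arguments<T>&, const size_t id1, const size_t id2);
//     static size_t ResultID1(const Arguments<T>&);            // extent of the result's 1st dim
//     static size_t ResultID2(const Arguments<T>&);            // extent of the result's 2nd dim
//   };
// =================================================================================================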
CLBlast-1.6.3/test/correctness/testblas.hpp000066400000000000000000000640701463263031500206620ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file tests any CLBlast routine. It contains two types of tests: one testing all sorts of // input combinations, and one deliberatly testing with invalid values. // Typename T: the data-type of the routine's memory buffers (==precision) // Typename U: the data-type of the alpha and beta arguments // // ================================================================================================= #ifndef CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_ #define CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_ #include #include #include #include "test/correctness/tester.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestBlas: public Tester { public: static const int kSeed; // Uses several variables from the Tester class using Tester::context_; using Tester::queue_; using Tester::full_test_; using Tester::verbose_; using Tester::device_; using Tester::compare_clblas_; using Tester::compare_cblas_; // Uses several helper functions from the Tester class using Tester::TestStart; using Tester::TestEnd; using Tester::TestErrorCount; using Tester::TestErrorCodes; using Tester::GetOffsets; using Tester::GetOptionsString; using Tester::GetSizesString; // Test settings for the regular test. Append to these lists in case more tests are required. 
static const std::vector kVectorDims; static const std::vector kIncrements; static const std::vector kMatrixDims; static const std::vector kMatrixVectorDims; static const std::vector kBandSizes; static const std::vector kPadSizes; static const std::vector kDilationSizes; static const std::vector kKernelSizes; static const std::vector kBatchCounts; static const std::vector kNumKernels; static const std::vector kStrideValues; static const std::vector kChannelValues; static const std::vector kKernelModes; const std::vector kOffsets; const std::vector kAlphaValues; const std::vector kBetaValues; // Test settings for the invalid tests static const std::vector kInvalidIncrements; static const size_t kBufferSize; static const std::vector kMatSizes; static const std::vector kVecSizes; // The layout/transpose/triangle options to test with static const std::vector kLayouts; static const std::vector kTriangles; static const std::vector kSides; static const std::vector kDiagonals; static const std::vector kTransposes; // Data-type dependent, see .cpp-file // Shorthand for the routine-specific functions passed to the tester using DataPrepare = std::function&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&)>; using Routine = std::function&, Buffers&, Queue&)>; using ResultGet = std::function(const Arguments&, Buffers&, Queue&)>; using ResultIndex = std::function&, const size_t, const size_t)>; using ResultIterator = std::function&)>; // Constructor, initializes the base class tester and input data TestBlas(const std::vector &arguments, const bool silent, const std::string &name, const std::vector &options, const DataPrepare prepare_data, const Routine run_routine, const Routine run_reference1, const Routine run_reference2, const ResultGet get_result, const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2); // The test functions, taking no inputs void TestRegular(std::vector> &test_vector, const std::string &name); void TestInvalid(std::vector> &test_vector, const std::string &name); private: // Source data to test with std::vector x_source_; std::vector y_source_; std::vector a_source_; std::vector b_source_; std::vector c_source_; std::vector ap_source_; std::vector scalar_source_; // The routine-specific functions passed to the tester const DataPrepare prepare_data_; const Routine run_routine_; const Routine run_reference1_; const Routine run_reference2_; const ResultGet get_result_; const ResultIndex get_index_; const ResultIterator get_id1_; const ResultIterator get_id2_; }; // ================================================================================================= template const int TestBlas::kSeed = 42; // fixed seed for reproducibility // Test settings for the regular test. Append to these lists in case more tests are required. 
template const std::vector TestBlas::kVectorDims = { 7, 93, 144, 4096 }; template const std::vector TestBlas::kIncrements = { 1, 2, 7 }; template const std::vector TestBlas::kMatrixDims = { 7, 64 }; template const std::vector TestBlas::kMatrixVectorDims = { 61, 256 }; template const std::vector TestBlas::kBandSizes = { 4, 19 }; template const std::vector TestBlas::kBatchCounts = { 1, 3 }; template const std::vector TestBlas::kPadSizes = { 0, 1 }; template const std::vector TestBlas::kDilationSizes = { 1, 2 }; template const std::vector TestBlas::kKernelSizes = { 1, 3 }; template const std::vector TestBlas::kNumKernels = { 1, 6 }; template const std::vector TestBlas::kStrideValues = { 1, 3 }; template const std::vector TestBlas::kChannelValues = { 1, 2 }; template const std::vector TestBlas::kKernelModes = { KernelMode::kCrossCorrelation, KernelMode::kConvolution }; // Test settings for the invalid tests template const std::vector TestBlas::kInvalidIncrements = { 0, 1 }; template const size_t TestBlas::kBufferSize = 64; template const std::vector TestBlas::kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize}; template const std::vector TestBlas::kVecSizes = {0, kBufferSize - 1, kBufferSize}; // The layout/triangle options to test with template const std::vector TestBlas::kLayouts = {Layout::kRowMajor, Layout::kColMajor}; template const std::vector TestBlas::kTriangles = {Triangle::kUpper, Triangle::kLower}; template const std::vector TestBlas::kSides = {Side::kLeft, Side::kRight}; template const std::vector TestBlas::kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit}; // ================================================================================================= // Bogus reference function, in case a comparison library is not available template static StatusCode ReferenceNotAvailable(const Arguments &, BufferType &, Queue &) { return StatusCode::kNotImplemented; } // Helper for the below function: MSVC's C1061 error requires part of the for-loops to be in a // separate file. This part handles the im2col/xconv arguments. 
template void handle_remaining_of_options(std::vector> ®ular_test_vector, Arguments &r_args, TestBlas &tester, const std::vector &kernel_modes, const std::vector &channelss, const std::vector &heights, const std::vector &widths, const std::vector &kernel_hs, const std::vector &kernel_ws, const std::vector &pad_hs, const std::vector &pad_ws, const std::vector &stride_hs, const std::vector &stride_ws, const std::vector &dilation_hs, const std::vector &dilation_ws, const std::vector &batch_counts, const std::vector &num_kernelss) { for (auto &kernel_mode: kernel_modes) { r_args.kernel_mode = kernel_mode; for (auto &channels: channelss) { r_args.channels = channels; for (auto &height: heights) { r_args.height = height; for (auto &width: widths) { r_args.width = width; for (auto &kernel_h: kernel_hs) { r_args.kernel_h = kernel_h; for (auto &kernel_w: kernel_ws) { r_args.kernel_w = kernel_w; for (auto &pad_h: pad_hs) { r_args.pad_h = pad_h; for (auto &pad_w: pad_ws) { r_args.pad_w = pad_w; for (auto &stride_h: stride_hs) { r_args.stride_h = stride_h; for (auto &stride_w: stride_ws) { r_args.stride_w = stride_w; for (auto &dilation_h: dilation_hs) { r_args.dilation_h = dilation_h; for (auto &dilation_w: dilation_ws) { r_args.dilation_w = dilation_w; for (auto &batch_count: batch_counts) { r_args.batch_count = batch_count; for (auto &num_kernels: num_kernelss) { r_args.num_kernels = num_kernels; C::SetSizes(r_args, tester.queue_); regular_test_vector.push_back(r_args); } } } } } } } } } } } } } } } // The interface to the correctness tester. This is a separate function in the header such that it // is automatically compiled for each routine, templated by the parameter "C". template size_t RunTests(int argc, char *argv[], const bool silent, const std::string &name) { auto command_line_args = RetrieveCommandLineArguments(argc, argv); // Sets the clBLAS reference to test against #ifdef CLBLAST_REF_CLBLAS auto reference_routine1 = C::RunReference1; // clBLAS when available #else auto reference_routine1 = ReferenceNotAvailable>; #endif // Sets the CBLAS reference to test against #ifdef CLBLAST_REF_CBLAS auto reference_routine2 = [](const Arguments &args, Buffers &buffers, Queue &queue) -> StatusCode { auto buffers_host = BuffersHost(); DeviceToHost(args, buffers, buffers_host, queue, C::BuffersIn()); C::RunReference2(args, buffers_host, queue); HostToDevice(args, buffers, buffers_host, queue, C::BuffersOut()); return StatusCode::kSuccess; }; #else auto reference_routine2 = ReferenceNotAvailable>; #endif // Non-BLAS routines cannot be fully tested if (!silent && C::BLASLevel() == 4) { fprintf(stdout, "\n* NOTE: This non-BLAS routine is tested against a custom implementation,\n"); fprintf(stdout, " not against clBLAS or a CPU BLAS library. Thus, the arguments '-clblas'\n"); fprintf(stdout, " and '-cblas' have no effect.\n"); } // Creates a tester auto options = C::GetOptions(); TestBlas tester{command_line_args, silent, name, options, C::PrepareData, C::RunRoutine, reference_routine1, reference_routine2, C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2}; // This variable holds the arguments relevant for this routine auto args = Arguments{}; // Initializes the vectors with a single element. If this particular option is relevant for this // routine, this vector is overridden. Otherwise, it is unused - the value here does not matter. 
auto ms = std::vector{args.m}; auto ns = std::vector{args.n}; auto ks = std::vector{args.k}; auto kus = std::vector{args.ku}; auto kls = std::vector{args.kl}; auto layouts = std::vector{args.layout}; auto a_transposes = std::vector{args.a_transpose}; auto b_transposes = std::vector{args.b_transpose}; auto sides = std::vector{args.side}; auto triangles = std::vector{args.triangle}; auto diagonals = std::vector{args.diagonal}; auto x_incs = std::vector{args.x_inc}; auto y_incs = std::vector{args.y_inc}; auto x_offsets = std::vector{args.x_offset}; auto y_offsets = std::vector{args.y_offset}; auto a_lds = std::vector{args.a_ld}; auto b_lds = std::vector{args.b_ld}; auto c_lds = std::vector{args.c_ld}; auto a_offsets = std::vector{args.a_offset}; auto b_offsets = std::vector{args.b_offset}; auto c_offsets = std::vector{args.c_offset}; auto ap_offsets = std::vector{args.ap_offset}; auto dot_offsets = std::vector{args.dot_offset}; auto nrm2_offsets = std::vector{args.nrm2_offset}; auto asum_offsets = std::vector{args.asum_offset}; auto imax_offsets = std::vector{args.imax_offset}; auto alphas = std::vector{args.alpha}; auto betas = std::vector{args.beta}; auto kernel_modes = std::vector{args.kernel_mode}; auto channelss = std::vector{args.channels}; auto heights = std::vector{args.height}; auto widths = std::vector{args.width}; auto kernel_hs = std::vector{args.kernel_h}; auto kernel_ws = std::vector{args.kernel_w}; auto pad_hs = std::vector{args.pad_h}; auto pad_ws = std::vector{args.pad_w}; auto stride_hs = std::vector{args.stride_h}; auto stride_ws = std::vector{args.stride_w}; auto dilation_hs = std::vector{args.dilation_h}; auto dilation_ws = std::vector{args.dilation_w}; auto batch_counts = std::vector{args.batch_count}; auto num_kernelss = std::vector{args.num_kernels}; auto x_sizes = std::vector{args.x_size}; auto y_sizes = std::vector{args.y_size}; auto a_sizes = std::vector{args.a_size}; auto b_sizes = std::vector{args.b_size}; auto c_sizes = std::vector{args.c_size}; auto ap_sizes = std::vector{args.ap_size}; // Sets the dimensions of the matrices or vectors depending on the BLAS level auto dimensions = (C::BLASLevel() == 4) ? tester.kMatrixDims : // non-BLAS extra routines (C::BLASLevel() == 3) ? tester.kMatrixDims : // level 3 (C::BLASLevel() == 2) ? 
tester.kMatrixVectorDims : // level 2 tester.kVectorDims; // else: level 1 // For the options relevant to this routine, sets the vectors to proper values for (auto &option: options) { if (option == kArgM) { ms = dimensions; } if (option == kArgN) { ns = dimensions; } if (option == kArgK) { ks = dimensions; } if (option == kArgKU) { kus = tester.kBandSizes; } if (option == kArgKL) { kls = tester.kBandSizes; } if (option == kArgLayout) { layouts = tester.kLayouts; } if (option == kArgATransp) { a_transposes = C::GetATransposes(tester.kTransposes); } if (option == kArgBTransp) { b_transposes = C::GetBTransposes(tester.kTransposes); } if (option == kArgSide) { sides = tester.kSides; } if (option == kArgTriangle) { triangles = tester.kTriangles; } if (option == kArgDiagonal) { diagonals = tester.kDiagonals; } if (option == kArgXInc) { x_incs = tester.kIncrements; } if (option == kArgYInc) { y_incs = tester.kIncrements; } if (option == kArgXOffset) { x_offsets = tester.kOffsets; } if (option == kArgYOffset) { y_offsets = tester.kOffsets; } if (option == kArgALeadDim) { a_lds = dimensions; } if (option == kArgBLeadDim) { b_lds = dimensions; } if (option == kArgCLeadDim) { c_lds = dimensions; } if (option == kArgAOffset) { a_offsets = tester.kOffsets; } if (option == kArgBOffset) { b_offsets = tester.kOffsets; } if (option == kArgCOffset) { c_offsets = tester.kOffsets; } if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; } if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; } if (option == kArgNrm2Offset) { nrm2_offsets = tester.kOffsets; } if (option == kArgAsumOffset) { asum_offsets = tester.kOffsets; } if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } if (option == kArgKernelMode) { kernel_modes = tester.kKernelModes; } if (option == kArgChannels) { channelss = tester.kChannelValues; } if (option == kArgHeight) { heights = tester.kMatrixDims; } if (option == kArgWidth) { widths = tester.kMatrixDims; } if (option == kArgKernelH) { kernel_hs = tester.kKernelSizes; } if (option == kArgKernelW) { kernel_ws = tester.kKernelSizes; } if (option == kArgPadH) { pad_hs = tester.kPadSizes; } if (option == kArgPadW) { pad_ws = tester.kPadSizes; } if (option == kArgStrideH) { stride_hs = tester.kStrideValues; } if (option == kArgStrideW) { stride_ws = tester.kStrideValues; } if (option == kArgDilationH) { dilation_hs = tester.kDilationSizes; } if (option == kArgDilationW) { dilation_ws = tester.kDilationSizes; } if (option == kArgBatchCount) { batch_counts = tester.kBatchCounts; } if (option == kArgNumKernels) { num_kernelss = tester.kNumKernels; } if (option == kArgXOffset) { x_sizes = tester.kVecSizes; } if (option == kArgYOffset) { y_sizes = tester.kVecSizes; } if (option == kArgAOffset) { a_sizes = tester.kMatSizes; } if (option == kArgBOffset) { b_sizes = tester.kMatSizes; } if (option == kArgCOffset) { c_sizes = tester.kMatSizes; } if (option == kArgAPOffset) { ap_sizes = tester.kMatSizes; } } // Loops over the test-cases from a data-layout point of view for (auto &layout: layouts) { args.layout = layout; for (auto &a_transpose: a_transposes) { args.a_transpose = a_transpose; for (auto &b_transpose: b_transposes) { args.b_transpose = b_transpose; for (auto &side: sides) { args.side = side; for (auto &triangle: triangles) { args.triangle = triangle; for (auto &diagonal: diagonals) { args.diagonal = diagonal; // Creates the arguments vector for the regular 
tests auto regular_test_vector = std::vector>{}; auto r_args = args; for (auto &m: ms) { r_args.m = m; for (auto &n: ns) { r_args.n = n; for (auto &k: ks) { r_args.k = k; for (auto &ku: kus) { r_args.ku = ku; for (auto &kl: kls) { r_args.kl = kl; for (auto &x_inc: x_incs) { r_args.x_inc = x_inc; for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset; for (auto &y_inc: y_incs) { r_args.y_inc = y_inc; for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset; for (auto &a_ld: a_lds) { r_args.a_ld = a_ld; for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset; for (auto &b_ld: b_lds) { r_args.b_ld = b_ld; for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset; for (auto &c_ld: c_lds) { r_args.c_ld = c_ld; for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset; for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset; for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; for (auto &nrm2_offset: nrm2_offsets) { r_args.nrm2_offset = nrm2_offset; for (auto &asum_offset: asum_offsets) { r_args.asum_offset = asum_offset; for (auto &imax_offset: imax_offsets) { r_args.imax_offset = imax_offset; for (auto &alpha: alphas) { r_args.alpha = alpha; for (auto &beta: betas) { r_args.beta = beta; // Cannot have more for-loops because of MSVC's C1061 error handle_remaining_of_options(regular_test_vector, r_args, tester, kernel_modes, channelss, heights, widths, kernel_hs, kernel_ws, pad_hs, pad_ws, stride_hs, stride_ws, dilation_hs, dilation_ws, batch_counts, num_kernelss); } } } } } } } } } } } } } } } } } } } } } } // Creates the arguments vector for the invalid-buffer tests #ifdef CLBLAST_REF_CLBLAS auto invalid_test_vector = std::vector>{}; auto i_args = args; i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; i_args.batch_count = 3; i_args.alphas = std::vector(i_args.batch_count); i_args.betas = std::vector(i_args.batch_count); i_args.a_offsets = std::vector(i_args.batch_count); i_args.b_offsets = std::vector(i_args.batch_count); i_args.c_offsets = std::vector(i_args.batch_count); for (auto &x_size: x_sizes) { i_args.x_size = x_size; for (auto &y_size: y_sizes) { i_args.y_size = y_size; for (auto &a_size: a_sizes) { i_args.a_size = a_size; for (auto &b_size: b_sizes) { i_args.b_size = b_size; for (auto &c_size: c_sizes) { i_args.c_size = c_size; for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; invalid_test_vector.push_back(i_args); } } } } } } #endif // Sets the name of this test-case auto names = std::vector{}; for (auto &option: options) { if (option == kArgLayout) { names.push_back(ToString(layout)); } if (option == kArgATransp) { names.push_back(ToString(a_transpose)); } if (option == kArgBTransp) { names.push_back(ToString(b_transpose)); } if (option == kArgSide) { names.push_back(ToString(side)); } if (option == kArgTriangle) { names.push_back(ToString(triangle)); } if (option == kArgDiagonal) { names.push_back(ToString(diagonal)); } } if (names.size() == 0) { names.push_back("default"); } auto case_name = std::string{}; for (auto i=size_t{0}; i // // This file implements the Tester class (see the header for information about the class). 
//
// =================================================================================================

// Note: the original system-header names were lost in extraction; these are inferred from usage
#include <algorithm>
#include <cmath>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include "test/correctness/tester.hpp"

namespace clblast {
// =================================================================================================

// Relative error margins
template <typename T>
float getRelativeErrorMargin() {
  return 0.005f; // 0.5% is considered acceptable for float/double-precision
}
template float getRelativeErrorMargin<float>(); // as the above default
template float getRelativeErrorMargin<double>(); // as the above default
template float getRelativeErrorMargin<float2>(); // as the above default
template float getRelativeErrorMargin<double2>(); // as the above default
template <>
float getRelativeErrorMargin<half>() {
  return 0.080f; // 8% (!) error is considered acceptable for half-precision
}

// Absolute error margins
template <typename T>
float getAbsoluteErrorMargin() {
  return 0.001f;
}
template float getAbsoluteErrorMargin<float>(); // as the above default
template float getAbsoluteErrorMargin<double>(); // as the above default
template float getAbsoluteErrorMargin<float2>(); // as the above default
template float getAbsoluteErrorMargin<double2>(); // as the above default
template <>
float getAbsoluteErrorMargin<half>() {
  return 0.15f; // especially small values are inaccurate for half-precision
}

// L2 error margins
template <typename T>
double getL2ErrorMargin() {
  return 0.0f; // zero means don't look at the L2 error margin at all, use the other metrics
}
template double getL2ErrorMargin<float>(); // as the above default
template double getL2ErrorMargin<double>(); // as the above default
template double getL2ErrorMargin<float2>(); // as the above default
template double getL2ErrorMargin<double2>(); // as the above default
template <>
double getL2ErrorMargin<half>() {
  return 0.05; // half-precision results are considered OK as long as the L2 error is low enough
}

// Error margin: numbers beyond this value are considered equal to inf or NaN
template <typename T>
T getAlmostInfNumber() {
  return static_cast<T>(1e35); // used for correctness testing of TRSV and TRSM routines
}

// =================================================================================================

// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
// the clBLAS library for reference.
template Tester::Tester(const std::vector &arguments, const bool silent, const std::string &name, const std::vector &options): help_("Options given/available:\n"), platform_(Platform(GetArgument(arguments, help_, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})))), device_(Device(platform_, GetArgument(arguments, help_, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})))), context_(Context(device_)), queue_(Queue(context_, device_)), full_test_(CheckArgument(arguments, help_, kArgFullTest)), verbose_(CheckArgument(arguments, help_, kArgVerbose)), error_log_{}, num_passed_{0}, num_skipped_{0}, num_failed_{0}, print_count_{0}, tests_passed_{0}, tests_skipped_{0}, tests_failed_{0} { options_ = options; // Determines which reference is the default #if defined(CLBLAST_REF_CBLAS) auto default_cblas = 0; #endif #if defined(CLBLAST_REF_CLBLAS) auto default_clblas = 0; #endif #if defined(CLBLAST_REF_CUBLAS) auto default_cublas = 0; #endif #if defined(CLBLAST_REF_CBLAS) default_cblas = 1; #elif defined(CLBLAST_REF_CLBLAS) default_clblas = 1; #elif defined(CLBLAST_REF_CUBLAS) default_cublas = 1; #endif // Determines which reference to test against compare_clblas_ = 0; compare_cblas_ = 0; compare_cublas_ = 0; #if defined(CLBLAST_REF_CBLAS) compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, default_cblas); #endif #if defined(CLBLAST_REF_CLBLAS) compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, default_clblas); #endif #if defined(CLBLAST_REF_CUBLAS) compare_cublas_ = GetArgument(arguments, help_, kArgComparecublas, default_cublas); #endif // Prints the help message (command-line arguments) if (!silent) { fprintf(stdout, "\n* %s\n", help_.c_str()); } // Support for cuBLAS not available yet if (compare_cublas_) { throw std::runtime_error("Cannot test against cuBLAS; not implemented yet"); } // Can only test against a single reference (not two, not zero) if (compare_clblas_ && compare_cblas_) { throw std::runtime_error("Cannot test against both clBLAS and CBLAS references; choose one using the -cblas and -clblas arguments"); } if (!compare_clblas_ && !compare_cblas_) { throw std::runtime_error("Choose one reference (clBLAS or CBLAS) to test against using the -cblas and -clblas arguments"); } // Prints the header fprintf(stdout, "* Running on OpenCL device '%s'.\n", GetDeviceName(device_).c_str()); fprintf(stdout, "* Starting tests for the %s'%s'%s routine.", kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str()); // Checks whether the precision is supported if (!PrecisionSupported(device_)) { fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n", kPrintWarning.c_str(), kPrintEnd.c_str()); return; } // Prints the legend fprintf(stdout, " Legend:\n"); fprintf(stdout, " %s -> Test produced correct results\n", kSuccessData.c_str()); fprintf(stdout, " %s -> Test returned the correct error code\n", kSuccessStatus.c_str()); fprintf(stdout, " %s -> Test produced incorrect results\n", kErrorData.c_str()); fprintf(stdout, " %s -> Test returned an incorrect error code\n", kErrorStatus.c_str()); fprintf(stdout, " %s -> Test not executed: OpenCL-kernel compilation error\n", kSkippedCompilation.c_str()); fprintf(stdout, " %s -> Test not executed: Unsupported precision\n", kUnsupportedPrecision.c_str()); fprintf(stdout, " %s -> Test not completed: Reference CBLAS doesn't output error codes\n", kUnsupportedReference.c_str()); fprintf(stdout, "* Testing with error margins of %.1lf%% (relative) and %.3lf (absolute)\n", 100.0f * 
getRelativeErrorMargin(), getAbsoluteErrorMargin()); if (getL2ErrorMargin() != 0.0f) { fprintf(stdout, "* and a combined maximum allowed L2 error of %.2e\n", getL2ErrorMargin()); } // Initializes clBLAS #ifdef CLBLAST_REF_CLBLAS if (compare_clblas_) { auto status = clblasSetup(); if (status != CL_SUCCESS) { throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); } } #endif } // Destructor prints the summary of the test cases and cleans-up the clBLAS library template Tester::~Tester() { if (PrecisionSupported(device_)) { std::cout << "* Completed all test-cases for this routine. Results:" << std::endl; std::cout << " " << tests_passed_ << " test(s) passed" << std::endl; if (tests_skipped_ > 0) { std::cout << kPrintWarning; } std::cout << " " << tests_skipped_ << " test(s) skipped" << kPrintEnd << std::endl; if (tests_failed_ > 0) { std::cout << kPrintError; } std::cout << " " << tests_failed_ << " test(s) failed" << kPrintEnd << std::endl; } std::cout << std::endl; // Cleans-up clBLAS #ifdef CLBLAST_REF_CLBLAS if (compare_clblas_) { clblasTeardown(); } #endif } // ================================================================================================= // Function called at the start of each test. This prints a header with information about the // test and re-initializes all test data-structures. template void Tester::TestStart(const std::string &test_name, const std::string &test_configuration) { // Prints the header fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n", kPrintMessage.c_str(), test_name.c_str(), kPrintEnd.c_str(), kPrintMessage.c_str(), test_configuration.c_str(), kPrintEnd.c_str()); if (!verbose_) { fprintf(stdout, " "); } // Empties the error log and the error/pass counters error_log_.clear(); num_passed_ = 0; num_skipped_ = 0; num_failed_ = 0; print_count_ = 0; } // Function called at the end of each test. This prints errors if any occured. It also prints a // summary of the number of sub-tests passed/failed. template void Tester::TestEnd() { if (!verbose_) { fprintf(stdout, "\n"); } tests_passed_ += num_passed_; tests_skipped_ += num_skipped_; tests_failed_ += num_failed_; // Prints the errors PrintErrorLog(error_log_); // Prints a test summary auto pass_rate = 100*num_passed_ / static_cast(num_passed_ + num_skipped_ + num_failed_); fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str()); std::cout << " " << num_passed_ << " passed /"; if (num_skipped_ != 0) { std::cout << " " << kPrintWarning << num_skipped_ << " skipped" << kPrintEnd << " /"; } else { std::cout << " " << num_skipped_ << " skipped /"; } if (num_failed_ != 0) { std::cout << " " << kPrintError << num_failed_ << " failed" << kPrintEnd << std::endl; } else { std::cout << " " << num_failed_ << " failed" << std::endl; } } // ================================================================================================= // Handles a 'pass' or 'error' depending on whether there are any errors template void Tester::TestErrorCount(const size_t errors, const size_t size, const Arguments &args) { // Finished successfully if (errors == 0) { PrintTestResult(kSuccessData); ReportPass(); } // Error(s) occurred else { auto percentage = 100*errors / static_cast(size); PrintTestResult(kErrorData); ReportError({StatusCode::kSuccess, StatusCode::kSuccess, percentage, args}); } } // Compares two status codes for equality. 
The outcome can be a pass (they are the same), a warning // (CLBlast reported a compilation error), or an error (they are different). template void Tester::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status, const Arguments &args) { // Either an OpenCL or CLBlast internal error occurred, fail the test immediately // NOTE: the OpenCL error codes grow downwards without any declared lower bound, hence the magic // number. The last error code is atm around -70, but -500 is chosen to be on the safe side. if (clblast_status != StatusCode::kSuccess && (clblast_status > static_cast(-500) /* matches OpenCL errors (see above) */ || clblast_status < StatusCode::kNotImplemented) /* matches CLBlast internal errors */) { PrintTestResult(kErrorStatus); ReportError({StatusCode::kSuccess, clblast_status, kStatusError, args}); if (verbose_) { fprintf(stdout, "\n"); PrintErrorLog({{StatusCode::kSuccess, clblast_status, kStatusError, args}}); fprintf(stdout, " "); } } // Routine is not implemented else if (clblast_status == StatusCode::kNotImplemented) { PrintTestResult(kSkippedCompilation); ReportSkipped(); } // Cannot compare error codes against a library other than clBLAS else if (compare_cblas_) { PrintTestResult(kUnsupportedReference); ReportSkipped(); } // Finished successfully else if (clblas_status == clblast_status) { PrintTestResult(kSuccessStatus); ReportPass(); } // No support for this kind of precision else if (clblast_status == StatusCode::kNoDoublePrecision || clblast_status == StatusCode::kNoHalfPrecision) { PrintTestResult(kUnsupportedPrecision); ReportSkipped(); } // Error occurred else { PrintTestResult(kErrorStatus); ReportError({clblas_status, clblast_status, kStatusError, args}); if (verbose_) { fprintf(stdout, "\n"); PrintErrorLog({{clblas_status, clblast_status, kStatusError, args}}); fprintf(stdout, " "); } } } // ================================================================================================= // Retrieves the offset values to test with template const std::vector Tester::GetOffsets() const { if (full_test_) { return {0, 10}; } else { return {0}; } } // Retrieves the options as a string for a specific test template std::string Tester::GetOptionsString(const Arguments &args) { auto result = std::string(""); const auto equals = std::string("="); for (auto &o: options_) { if (o == kArgM) { result += kArgM + equals + ToString(args.m) + " "; } if (o == kArgN) { result += kArgN + equals + ToString(args.n) + " "; } if (o == kArgK) { result += kArgK + equals + ToString(args.k) + " "; } if (o == kArgKU) { result += kArgKU + equals + ToString(args.ku) + " "; } if (o == kArgKL) { result += kArgKL + equals + ToString(args.kl) + " "; } if (o == kArgXInc) { result += kArgXInc + equals + ToString(args.x_inc) + " "; } if (o == kArgYInc) { result += kArgYInc + equals + ToString(args.y_inc) + " "; } if (o == kArgXOffset) { result += kArgXOffset + equals + ToString(args.x_offset) + " "; } if (o == kArgYOffset) { result += kArgYOffset + equals + ToString(args.y_offset) + " "; } if (o == kArgALeadDim) { result += kArgALeadDim + equals + ToString(args.a_ld) + " "; } if (o == kArgBLeadDim) { result += kArgBLeadDim + equals + ToString(args.b_ld) + " "; } if (o == kArgCLeadDim) { result += kArgCLeadDim + equals + ToString(args.c_ld) + " "; } if (o == kArgAOffset) { result += kArgAOffset + equals + ToString(args.a_offset) + " "; } if (o == kArgBOffset) { result += kArgBOffset + equals + ToString(args.b_offset) + " "; } if (o == kArgCOffset) { result += kArgCOffset + 
equals + ToString(args.c_offset) + " "; } if (o == kArgAPOffset) { result += kArgAPOffset + equals + ToString(args.ap_offset) + " "; } if (o == kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; } if (o == kArgAlpha) { result += kArgAlpha + equals + ToString(args.alpha) + " "; } if (o == kArgBeta) { result += kArgBeta + equals + ToString(args.beta) + " "; } if (o == kArgBatchCount){result += kArgBatchCount + equals + ToString(args.batch_count) + " "; } if (o == kArgKernelMode){result += kArgKernelMode + equals + ToString(args.kernel_mode) + " "; } if (o == kArgChannels) { result += kArgChannels + equals + ToString(args.channels) + " "; } if (o == kArgHeight) { result += kArgHeight + equals + ToString(args.height) + " "; } if (o == kArgWidth) { result += kArgWidth + equals + ToString(args.width) + " "; } if (o == kArgNumKernels){result += kArgNumKernels + equals + ToString(args.num_kernels) + " "; } if (o == kArgKernelH) { result += kArgKernelH + equals + ToString(args.kernel_h) + " "; } if (o == kArgKernelW) { result += kArgKernelW + equals + ToString(args.kernel_w) + " "; } if (o == kArgPadH) { result += kArgPadH + equals + ToString(args.pad_h) + " "; } if (o == kArgPadW) { result += kArgPadW + equals + ToString(args.pad_w) + " "; } if (o == kArgStrideH) { result += kArgStrideH + equals + ToString(args.stride_h) + " "; } if (o == kArgStrideW) { result += kArgStrideW + equals + ToString(args.stride_w) + " "; } if (o == kArgDilationH){ result += kArgDilationH + equals + ToString(args.dilation_h) + " "; } if (o == kArgDilationW){ result += kArgDilationW + equals + ToString(args.dilation_w) + " "; } } return result; } // As above, but now only prints information relevant to invalid buffer sizes template std::string Tester::GetSizesString(const Arguments &args) { auto result = std::string(""); const auto equals = std::string("="); for (auto &o: options_) { if (o == kArgM) { result += kArgM + equals + ToString(args.m) + " "; } if (o == kArgN) { result += kArgN + equals + ToString(args.n) + " "; } if (o == kArgK) { result += kArgK + equals + ToString(args.k) + " "; } if (o == kArgXOffset) { result += "xsize" + equals + ToString(args.x_size) + " "; } if (o == kArgYOffset) { result += "ysize" + equals + ToString(args.y_size) + " "; } if (o == kArgAOffset) { result += "asize" + equals + ToString(args.a_size) + " "; } if (o == kArgBOffset) { result += "bsize" + equals + ToString(args.b_size) + " "; } if (o == kArgCOffset) { result += "csize" + equals + ToString(args.c_size) + " "; } if (o == kArgAPOffset) { result += "apsize" + equals + ToString(args.ap_size) + " "; } if (o == kArgDotOffset){ result += "scalarsize" + equals + ToString(args.scalar_size) + " "; } } return result; } // ================================================================================================= // A test can either pass, be skipped, or fail template void Tester::ReportPass() { num_passed_++; } template void Tester::ReportSkipped() { num_skipped_++; } template void Tester::ReportError(const ErrorLogEntry &error_log_entry) { error_log_.push_back(error_log_entry); num_failed_++; } // ================================================================================================= // Prints the test-result symbol to screen. This function limits the maximum number of symbols per // line by printing newlines once every so many calls. 
template void Tester::PrintTestResult(const std::string &message) { if (verbose_) { fprintf(stdout, "%s\n", message.c_str()); } else { if (print_count_ == kResultsPerLine) { print_count_ = 0; fprintf(stdout, "\n "); } fprintf(stdout, "%s", message.c_str()); print_count_++; } std::cout << std::flush; } // Prints details of errors occurred in a given error log template void Tester::PrintErrorLog(const std::vector &error_log) { for (auto &entry: error_log) { if (entry.error_percentage != kStatusError) { fprintf(stdout, " Error rate %.2lf%%: ", entry.error_percentage); } else { fprintf(stdout, " Status code %d (expected %d): ", static_cast(entry.status_found), static_cast(entry.status_expect)); } fprintf(stdout, "%s\n", GetOptionsString(entry.args).c_str()); } } // ================================================================================================= // Below are the non-member functions (separated because of otherwise required partial class // template specialization) // ================================================================================================= // Compares two floating point values and returns whether they are within an acceptable error // margin. This replaces GTest's EXPECT_NEAR(). template bool TestSimilarityNear(const T val1, const T val2, const T error_margin_absolute, const T error_margin_relative) { const auto difference = std::fabs(val1 - val2); // Shortcut, handles infinities if (val1 == val2) { return true; } // Handles cases with both results NaN or inf else if ((std::isnan(val1) && std::isnan(val2)) || (std::isinf(val1) && std::isinf(val2))) { return true; } // Also considers it OK if one of the results in NaN and the other is inf // Note: for TRSV and TRSM routines else if ((std::isnan(val1) && std::isinf(val2)) || (std::isinf(val1) && std::isnan(val2))) { return true; } // Also considers it OK if one of the values is super large and the other is inf or NaN // Note: for TRSV and TRSM routines else if ((std::abs(val1) > getAlmostInfNumber() && (std::isinf(val2) || std::isnan(val2))) || (std::abs(val2) > getAlmostInfNumber() && (std::isinf(val1) || std::isnan(val1)))) { return true; } // The values are zero or very small: the relative error is less meaningful else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) { return (difference < error_margin_absolute); } // Use relative error else { const auto absolute_sum = std::fabs(val1) + std::fabs(val2); return (difference / absolute_sum) < error_margin_relative; } } // Default method for similarity testing template bool TestSimilarity(const T val1, const T val2) { const auto kErrorMarginRelative = static_cast(getRelativeErrorMargin()); const auto kErrorMarginAbsolute = static_cast(getAbsoluteErrorMargin()); return TestSimilarityNear(val1, val2, kErrorMarginAbsolute, kErrorMarginRelative); } // Compiles the default case for standard data-types template bool TestSimilarity(const float, const float); template bool TestSimilarity(const double, const double); // Specialisations for non-standard data-types template <> bool TestSimilarity(const float2 val1, const float2 val2) { const auto real = TestSimilarity(val1.real(), val2.real()); const auto imag = TestSimilarity(val1.imag(), val2.imag()); if (real && imag) { return true; } // also OK if one is good and the combined is good (indicates a big diff between real & imag) if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); } return false; // neither real nor imag is good, return false } template <> 
bool TestSimilarity(const double2 val1, const double2 val2) { const auto real = TestSimilarity(val1.real(), val2.real()); const auto imag = TestSimilarity(val1.imag(), val2.imag()); if (real && imag) { return true; } // also OK if one is good and the combined is good (indicates a big diff between real & imag) if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); } return false; // neither real nor imag is good, return false } template <> bool TestSimilarity(const half val1, const half val2) { const auto kErrorMarginRelative = getRelativeErrorMargin(); const auto kErrorMarginAbsolute = getAbsoluteErrorMargin(); return TestSimilarityNear(HalfToFloat(val1), HalfToFloat(val2), kErrorMarginAbsolute, kErrorMarginRelative); } // ================================================================================================= // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various // routines. This function is specialised for the different data-types. template <> const std::vector GetExampleScalars(const bool full_test) { if (full_test) { return {0.0f, 1.0f, 3.14f}; } else { return {3.14f}; } } template <> const std::vector GetExampleScalars(const bool full_test) { if (full_test) { return {0.0, 1.0, 3.14}; } else { return {3.14}; } } template <> const std::vector GetExampleScalars(const bool full_test) { if (full_test) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; } else { return {{2.42f, 3.14f}}; } } template <> const std::vector GetExampleScalars(const bool full_test) { if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; } else { return {{2.42, 3.14}}; } } template <> const std::vector GetExampleScalars(const bool full_test) { if (full_test) { return {FloatToHalf(0.0f), FloatToHalf(1.0f), FloatToHalf(3.14f)}; } else { return {FloatToHalf(3.14f)}; } } // ================================================================================================= // Compiles the templated class template class Tester; template class Tester; template class Tester; template class Tester; template class Tester; template class Tester; template class Tester; // ================================================================================================= } // namespace clblast CLBlast-1.6.3/test/correctness/tester.hpp000066400000000000000000000215501463263031500203430ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the Tester class, providing a test-framework. GTest was used before, but // was not able to handle certain cases (e.g. template type + parameters). This is its (basic) // custom replacement. 
// Typename T: the data-type of the routine's memory buffers (==precision) // Typename U: the data-type of the alpha and beta arguments // // ================================================================================================= #ifndef CLBLAST_TEST_CORRECTNESS_TESTER_H_ #define CLBLAST_TEST_CORRECTNESS_TESTER_H_ #include #include #include #include "utilities/utilities.hpp" #include "test/test_utilities.hpp" // The libraries #ifdef CLBLAST_REF_CLBLAS #include #endif namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class Tester { public: // Maximum number of test results printed on a single line static const size_t kResultsPerLine; // Error percentage is not applicable: error was caused by an incorrect status static const float kStatusError; // Constants holding start and end strings for terminal-output in colour static const std::string kPrintError; static const std::string kPrintSuccess; static const std::string kPrintWarning; static const std::string kPrintMessage; static const std::string kPrintEnd; // Sets the output error coding static const std::string kSuccessData; static const std::string kSuccessStatus; static const std::string kErrorData; static const std::string kErrorStatus; static const std::string kSkippedCompilation; static const std::string kUnsupportedPrecision; static const std::string kUnsupportedReference; // This structure combines the above log-entry with a status code an error percentage struct ErrorLogEntry { StatusCode status_expect; StatusCode status_found; float error_percentage; Arguments args; }; // Creates an instance of the tester, running on a particular OpenCL platform and device. It // takes the routine's names as an additional parameter. explicit Tester(const std::vector &arguments, const bool silent, const std::string &name, const std::vector &options); ~Tester(); // These methods start and end a test-case. Within a test-case, multiple tests can be run. 
void TestStart(const std::string &test_name, const std::string &test_configuration); void TestEnd(); // Tests either an error count (should be zero) or two error codes (must match) void TestErrorCount(const size_t errors, const size_t size, const Arguments &args); void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status, const Arguments &args); // Returns the number of failed tests size_t NumFailedTests() const { return tests_failed_; } protected: // The help-message std::string help_; // The OpenCL objects (accessible by derived classes) Platform platform_; Device device_; Context context_; Queue queue_; // Whether or not to run the full test-suite or just a smoke test const bool full_test_; // Whether or not to print extra information when testing const bool verbose_; // Retrieves the offset values to test with const std::vector GetOffsets() const; // Retrieves the list of options as a string std::string GetOptionsString(const Arguments &args); // for regular tests std::string GetSizesString(const Arguments &args); // for invalid buffer sizes // Testing against reference implementations int compare_cblas_; int compare_clblas_; int compare_cublas_; private: // Internal methods to report a passed, skipped, or failed test void ReportPass(); void ReportSkipped(); void ReportError(const ErrorLogEntry &log_entry); // Prints the error or success symbol to screen void PrintTestResult(const std::string &message); // Prints an error log void PrintErrorLog(const std::vector &error_log); // Logging and counting occurrences of errors std::vector error_log_; size_t num_passed_; size_t num_skipped_; size_t num_failed_; // Counting the amount of errors printed on this row size_t print_count_; // Counting the number of test-cases with and without failures size_t tests_passed_; size_t tests_skipped_; size_t tests_failed_; // Arguments relevant for a specific routine std::vector options_; }; // Maximum number of test results printed on a single line template const size_t Tester::kResultsPerLine = size_t{64}; // Error percentage is not applicable: error was caused by an incorrect status template const float Tester::kStatusError = -1.0f; // Constants holding start and end strings for terminal-output in colour #if defined(_WIN32) template const std::string Tester::kPrintError = ""; template const std::string Tester::kPrintSuccess = ""; template const std::string Tester::kPrintWarning = ""; template const std::string Tester::kPrintMessage = ""; template const std::string Tester::kPrintEnd = ""; #else template const std::string Tester::kPrintError = "\x1b[31m"; template const std::string Tester::kPrintSuccess = "\x1b[32m"; template const std::string Tester::kPrintWarning = "\x1b[35m"; template const std::string Tester::kPrintMessage = "\x1b[1m"; template const std::string Tester::kPrintEnd = "\x1b[0m"; #endif // Sets the output error coding #if defined(_WIN32) template const std::string Tester::kSuccessData = ":"; // success template const std::string Tester::kSuccessStatus = "."; // success template const std::string Tester::kErrorData = "X"; // error template const std::string Tester::kErrorStatus = "/"; // error template const std::string Tester::kSkippedCompilation = "\\"; // warning template const std::string Tester::kUnsupportedPrecision = "o"; // warning template const std::string Tester::kUnsupportedReference = "-"; // warning #else template const std::string Tester::kSuccessData = "\x1b[32m:\x1b[0m"; // success template const std::string Tester::kSuccessStatus = 
"\x1b[32m.\x1b[0m"; // success template const std::string Tester::kErrorData = "\x1b[31mX\x1b[0m"; // error template const std::string Tester::kErrorStatus = "\x1b[31m/\x1b[0m"; // error template const std::string Tester::kSkippedCompilation = "\x1b[35m\\\x1b[0m"; // warning template const std::string Tester::kUnsupportedPrecision = "\x1b[35mo\x1b[0m"; // warning template const std::string Tester::kUnsupportedReference = "\x1b[35m-\x1b[0m"; // warning #endif // ================================================================================================= // Below are the non-member functions (separated because of otherwise required partial class // template specialization) // ================================================================================================= // Error margins template float getRelativeErrorMargin(); template float getAbsoluteErrorMargin(); template double getL2ErrorMargin(); // Compares two floating point values and returns whether they are within an acceptable error // margin. This replaces GTest's EXPECT_NEAR(). template bool TestSimilarity(const T val1, const T val2); // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various // routines. This function is specialised for the different data-types. template const std::vector GetExampleScalars(const bool full_test); // ================================================================================================= } // namespace clblast // CLBLAST_TEST_CORRECTNESS_TESTER_H_ #endif CLBlast-1.6.3/test/diagnostics.cpp000066400000000000000000000104771463263031500170130ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 

CLBlast-1.6.3/test/diagnostics.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains 'clinfo' like diagnostics specific for CLBlast (debugging)
//
// =================================================================================================

#include <cstdio>
#include <chrono>
#include <vector>

#include "utilities/timing.hpp"
#include "utilities/utilities.hpp"

namespace clblast {
// =================================================================================================

void OpenCLDiagnostics(int argc, char *argv[]) {
  auto arguments = RetrieveCommandLineArguments(argc, argv);

  // Retrieves the arguments
  auto help = std::string{"Options given/available:\n"};
  const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
  const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
  fprintf(stdout, "\n* %s\n", help.c_str());

  // Initializes OpenCL
  const auto platform = Platform(platform_id);
  const auto device = Device(platform, device_id);
  const auto context = Context(device);
  auto queue = Queue(context, device);

  // Finds device information
  const auto device_type = GetDeviceType(device);
  const auto device_vendor = GetDeviceVendor(device);
  const auto device_architecture = GetDeviceArchitecture(device);
  const auto device_name = GetDeviceName(device);

  printf("\n --- OpenCL device naming:\n");
  printf("* Device type                  %s\n", device.Type().c_str());
  printf("* Device name                  %s\n", device.Name().c_str());
  printf("* Platform vendor              %s\n", platform.Vendor().c_str());
  printf("* Platform version             %s\n", platform.Version().c_str());

  // Prints the CLBlast specific device names
  printf("\n --- CLBlast device naming:\n");
  printf("* Device type                  %s\n", device_type.c_str());
  printf("* Device name                  %s\n", device_name.c_str());
  printf("* Device vendor                %s\n", device_vendor.c_str());
  printf("* Device architecture          %s\n", device_architecture.c_str());

  // Selected OpenCL properties
  printf("\n --- OpenCL device properties:\n");
  printf("* Max work group size          %zu\n", device.MaxWorkGroupSize());
  printf("* Max work item dimensions     %zu\n", device.MaxWorkItemDimensions());
  const auto max_work_item_sizes = device.MaxWorkItemSizes();
  for (auto i = size_t{0}; i < max_work_item_sizes.size(); ++i) {
    printf("* - Max work item size #%zu     %zu\n", i, max_work_item_sizes[i]);
  }
  printf("* Local memory size            %zuKB\n", device.LocalMemSize());
  printf("* Extensions:\n%s\n", device.Capabilities().c_str());

  // Simple OpenCL benchmarking
  constexpr auto kNumRuns = 20;
  printf("\n --- Some OpenCL library benchmarks (functions from clpp11.h):\n");
  printf("* queue.GetContext()           %.4lf ms\n", TimeFunction(kNumRuns, [&](){queue.GetContext();} ));
  printf("* queue.GetDevice()            %.4lf ms\n", TimeFunction(kNumRuns, [&](){queue.GetDevice();} ));
  printf("* device.Name()                %.4lf ms\n", TimeFunction(kNumRuns, [&](){device.Name();} ));
  printf("* device.Vendor()              %.4lf ms\n", TimeFunction(kNumRuns, [&](){device.Vendor();} ));
  printf("* device.Version()             %.4lf ms\n", TimeFunction(kNumRuns, [&](){device.Version();} ));
  printf("* device.Platform()            %.4lf ms\n", TimeFunction(kNumRuns, [&](){device.PlatformID();} ));
  printf("* Buffer<float>(context, 1024) %.4lf ms\n", TimeFunction(kNumRuns, [&](){Buffer<float>(context, 1024);} ));
  printf("\n");
}

// =================================================================================================
} // namespace clblast

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  clblast::OpenCLDiagnostics(argc, argv);
  return 0;
}

// =================================================================================================
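
// TimeFunction (from utilities/timing.hpp) is used above but not defined in this file. A minimal
// sketch of such a best-of-N timing helper, assuming only the C++ standard library (the real
// helper's signature and return convention may differ):
//
//   #include <chrono>
//
//   template <typename F>
//   double TimeFunctionSketch(const int num_runs, F f) {
//     auto best_ms = 1e9;
//     for (auto run = 0; run < num_runs; ++run) {
//       const auto start = std::chrono::steady_clock::now();
//       f();  // the function under measurement
//       const auto elapsed = std::chrono::steady_clock::now() - start;
//       const auto ms = std::chrono::duration<double, std::milli>(elapsed).count();
//       if (ms < best_ms) { best_ms = ms; }  // keep the fastest of all runs
//     }
//     return best_ms;
//   }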

CLBlast-1.6.3/test/performance/
CLBlast-1.6.3/test/performance/client.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the common functions for the client-test environment.
//
// =================================================================================================

#include <string>
#include <vector>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <numeric>
#include <random>

#include "utilities/utilities.hpp"
#include "test/performance/client.hpp"

namespace clblast {
// =================================================================================================

template <typename T, typename U> const int Client<T,U>::kSeed = 42; // fixed seed for reproducibility

// Constructor
template <typename T, typename U>
Client<T,U>::Client(const Routine run_routine,
                    const Reference1 run_reference1, const Reference2 run_reference2,
                    const Reference3 run_reference3,
                    const std::vector<std::string> &options,
                    const std::vector<std::string> &buffers_in,
                    const std::vector<std::string> &buffers_out,
                    const GetMetric get_flops, const GetMetric get_bytes):
  run_routine_(run_routine),
  run_reference1_(run_reference1),
  run_reference2_(run_reference2),
  run_reference3_(run_reference3),
  options_(options),
  buffers_in_(buffers_in),
  buffers_out_(buffers_out),
  get_flops_(get_flops),
  get_bytes_(get_bytes) {
}

// =================================================================================================

// Parses all arguments available for the CLBlast client testers. Some arguments might not be
// applicable, but are searched for anyway to be able to create one common argument parser. All
// arguments have a default value in case they are not found.
template <typename T, typename U>
Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t level,
                                         const GetMetric default_a_ld,
                                         const GetMetric default_b_ld,
                                         const GetMetric default_c_ld) {
  const auto command_line_args = RetrieveCommandLineArguments(argc, argv);
  auto args = Arguments<U>{};
  auto help = std::string{"\n* Options given/available:\n"};

  // These are the options which are not for every client: they are optional
  for (auto &o: options_) {

    // Data-sizes
    if (o == kArgM)  { args.m  = GetArgument(command_line_args, help, kArgM, size_t{512}); }
    if (o == kArgN)  { args.n  = GetArgument(command_line_args, help, kArgN, size_t{512}); }
    if (o == kArgK)  { args.k  = GetArgument(command_line_args, help, kArgK, size_t{512}); }
    if (o == kArgKU) { args.ku = GetArgument(command_line_args, help, kArgKU, size_t{128}); }
    if (o == kArgKL) { args.kl = GetArgument(command_line_args, help, kArgKL, size_t{128}); }

    // Data-layouts
    if (o == kArgLayout)   { args.layout = GetArgument(command_line_args, help, kArgLayout, Layout::kRowMajor); }
    if (o == kArgATransp)  { args.a_transpose = GetArgument(command_line_args, help, kArgATransp, Transpose::kNo); }
    if (o == kArgBTransp)  { args.b_transpose = GetArgument(command_line_args, help, kArgBTransp, Transpose::kNo); }
    if (o == kArgSide)     { args.side = GetArgument(command_line_args, help, kArgSide, Side::kLeft); }
    if (o == kArgTriangle) { args.triangle = GetArgument(command_line_args, help, kArgTriangle, Triangle::kUpper); }
    if (o == kArgDiagonal) { args.diagonal = GetArgument(command_line_args, help, kArgDiagonal, Diagonal::kUnit); }

    // Vector arguments
    if (o == kArgXInc)    { args.x_inc = GetArgument(command_line_args, help, kArgXInc, size_t{1}); }
    if (o == kArgYInc)    { args.y_inc = GetArgument(command_line_args, help, kArgYInc, size_t{1}); }
    if (o == kArgXOffset) { args.x_offset = GetArgument(command_line_args, help, kArgXOffset, size_t{0}); }
    if (o == kArgYOffset) { args.y_offset = GetArgument(command_line_args, help, kArgYOffset, size_t{0}); }

    // Matrix arguments
    if (o == kArgALeadDim) { args.a_ld = GetArgument(command_line_args, help, kArgALeadDim, default_a_ld(args)); }
    if (o == kArgBLeadDim) { args.b_ld = GetArgument(command_line_args, help, kArgBLeadDim, default_b_ld(args)); }
    if (o == kArgCLeadDim) { args.c_ld = GetArgument(command_line_args, help, kArgCLeadDim, default_c_ld(args)); }
    if (o == kArgAOffset)  { args.a_offset = GetArgument(command_line_args, help, kArgAOffset, size_t{0}); }
    if (o == kArgBOffset)  { args.b_offset = GetArgument(command_line_args, help, kArgBOffset, size_t{0}); }
    if (o == kArgCOffset)  { args.c_offset = GetArgument(command_line_args, help, kArgCOffset, size_t{0}); }
    if (o == kArgAPOffset) { args.ap_offset = GetArgument(command_line_args, help, kArgAPOffset, size_t{0}); }

    // Scalar result arguments
    if (o == kArgDotOffset)  { args.dot_offset = GetArgument(command_line_args, help, kArgDotOffset, size_t{0}); }
    if (o == kArgNrm2Offset) { args.nrm2_offset = GetArgument(command_line_args, help, kArgNrm2Offset, size_t{0}); }
    if (o == kArgAsumOffset) { args.asum_offset = GetArgument(command_line_args, help, kArgAsumOffset, size_t{0}); }
    if (o == kArgImaxOffset) { args.imax_offset = GetArgument(command_line_args, help, kArgImaxOffset, size_t{0}); }

    // Batch arguments
    if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, size_t{1}); }

    // Scalar values
    if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<U>()); }
    if (o == kArgBeta)  { args.beta  = GetArgument(command_line_args, help, kArgBeta, GetScalar<U>()); }

    // Arguments for im2col and convgemm
    if (o == kArgKernelMode) { args.kernel_mode = GetArgument(command_line_args, help, kArgKernelMode, KernelMode::kConvolution); }
    if (o == kArgChannels)   { args.channels = GetArgument(command_line_args, help, kArgChannels, size_t{64}); }
    if (o == kArgHeight)     { args.height = GetArgument(command_line_args, help, kArgHeight, size_t{64}); }
    if (o == kArgWidth)      { args.width = GetArgument(command_line_args, help, kArgWidth, size_t{64}); }
    if (o == kArgKernelH)    { args.kernel_h = GetArgument(command_line_args, help, kArgKernelH, size_t{3}); }
    if (o == kArgKernelW)    { args.kernel_w = GetArgument(command_line_args, help, kArgKernelW, size_t{3}); }
    if (o == kArgPadH)       { args.pad_h = GetArgument(command_line_args, help, kArgPadH, size_t{0}); }
    if (o == kArgPadW)       { args.pad_w = GetArgument(command_line_args, help, kArgPadW, size_t{0}); }
    if (o == kArgStrideH)    { args.stride_h = GetArgument(command_line_args, help, kArgStrideH, size_t{1}); }
    if (o == kArgStrideW)    { args.stride_w = GetArgument(command_line_args, help, kArgStrideW, size_t{1}); }
    if (o == kArgDilationH)  { args.dilation_h = GetArgument(command_line_args, help, kArgDilationH, size_t{1}); }
    if (o == kArgDilationW)  { args.dilation_w = GetArgument(command_line_args, help, kArgDilationW, size_t{1}); }
    if (o == kArgNumKernels) { args.num_kernels = GetArgument(command_line_args, help, kArgNumKernels, size_t{1}); }
  }

  // These are the options common to all routines
  args.platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
  args.device_id   = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
  args.precision   = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
  #ifdef CLBLAST_REF_CLBLAS
    args.compare_clblas = GetArgument(command_line_args, help, kArgCompareclblas, 1);
  #else
    args.compare_clblas = 0;
  #endif
  #ifdef CLBLAST_REF_CBLAS
    args.compare_cblas = GetArgument(command_line_args, help, kArgComparecblas, 1);
  #else
    args.compare_cblas = 0;
  #endif
  #ifdef CLBLAST_REF_CUBLAS
    args.compare_cublas = GetArgument(command_line_args, help, kArgComparecublas, 1);
  #else
    args.compare_cublas = 0;
  #endif
  args.step            = GetArgument(command_line_args, help, kArgStepSize, size_t{1});
  args.num_steps       = GetArgument(command_line_args, help, kArgNumSteps, size_t{0});
  args.num_runs        = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
  args.print_help      = CheckArgument(command_line_args, help, kArgHelp);
  args.silent          = CheckArgument(command_line_args, help, kArgQuiet);
  args.no_abbrv        = CheckArgument(command_line_args, help, kArgNoAbbreviations);
  args.full_statistics = CheckArgument(command_line_args, help, kArgFullStatistics);
  warm_up_             = CheckArgument(command_line_args, help, kArgWarmUp);

  // Parse the optional JSON file name arguments
  const auto tuner_files_default = std::string{""};
  const auto tuner_files_string = GetArgument(command_line_args, help, kArgTunerFiles, tuner_files_default);
  if (tuner_files_string != tuner_files_default) {
    args.tuner_files = split(tuner_files_string, ',');
  }

  // Prints the chosen (or defaulted) arguments to screen. This also serves as the help message,
  // which is thus always displayed (unless silence is specified).
  if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); }

  // Comparison against a non-BLAS routine is not supported
  if (level == 4) { // level-4 == level-X
    if (args.compare_clblas != 0 || args.compare_cblas != 0 || args.compare_cublas != 0) {
      if (!args.silent) {
        fprintf(stdout, "* Disabling clBLAS/CBLAS/cuBLAS comparisons for this non-BLAS routine\n\n");
      }
    }
    args.compare_clblas = 0;
    args.compare_cblas = 0;
    args.compare_cublas = 0;
  }

  // Comparison against other BLAS libraries is not supported in case of half-precision
  if (args.precision == Precision::kHalf) {
    if (args.compare_clblas != 0 || args.compare_cblas != 0 || args.compare_cublas != 0) {
      if (!args.silent) {
        fprintf(stdout, "* Disabling clBLAS/CBLAS/cuBLAS comparisons for half-precision\n\n");
      }
    }
    args.compare_clblas = 0;
    args.compare_cblas = 0;
    args.compare_cublas = 0;
  }

  // Returns the arguments
  return args;
}

// =================================================================================================
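
// For reference, a typical invocation of a compiled client binary (such as clblast_client_xgemm,
// built when CLIENTS=ON) might look as follows; the flag names correspond to the kArg* constants
// parsed above, and the exact values are illustrative:
//
//   ./clblast_client_xgemm -m 1024 -n 1024 -k 1024 -num_steps 4 -step 256 -precision 32
//
// A minimal sketch of a "-name value" scanner in the style of GetArgument, assuming string-typed
// values for simplicity (the real helper in utilities.hpp is templated and also appends to the
// help message):
//
//   #include <cstddef>
//   #include <string>
//   #include <vector>
//
//   std::string GetArgumentSketch(const std::vector<std::string> &args,
//                                 const std::string &name, const std::string &fallback) {
//     for (std::size_t i = 0; i + 1 < args.size(); ++i) {
//       if (args[i] == "-" + name) { return args[i + 1]; }  // the value follows the "-name" token
//     }
//     return fallback;  // not found: use the default
//   }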

// This is the main performance tester
template <typename T, typename U>
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {

  // Initializes OpenCL and the libraries
  auto platform = Platform(args.platform_id);
  auto device = Device(platform, args.device_id);
  auto context = Context(device);
  auto queue = Queue(context, device);
  #ifdef CLBLAST_REF_CLBLAS
    if (args.compare_clblas) { clblasSetup(); }
  #endif
  #ifdef CLBLAST_REF_CUBLAS
    if (args.compare_cublas) { cublasSetup(args); }
  #endif

  // Optionally overrides parameters if tuner files are given (semicolon separated)
  OverrideParametersFromJSONFiles(args.tuner_files, device(), args.precision);

  // Prints the header of the output table
  PrintTableHeader(args);

  // Iterates over all "num_step" values jumping by "step" each time
  auto s = size_t{0};
  while(true) {

    // Sets the buffer sizes (routine-specific)
    set_sizes(args, queue);

    // Populates input host matrices with random data
    std::vector<T> x_source(args.x_size);
    std::vector<T> y_source(args.y_size);
    std::vector<T> a_source(args.a_size);
    std::vector<T> b_source(args.b_size);
    std::vector<T> c_source(args.c_size);
    std::vector<T> ap_source(args.ap_size);
    std::vector<T> scalar_source(args.scalar_size);
    std::mt19937 mt(kSeed);
    std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
    PopulateVector(x_source, mt, dist);
    PopulateVector(y_source, mt, dist);
    PopulateVector(a_source, mt, dist);
    PopulateVector(b_source, mt, dist);
    PopulateVector(c_source, mt, dist);
    PopulateVector(ap_source, mt, dist);
    PopulateVector(scalar_source, mt, dist);

    // Creates the matrices on the device
    auto x_vec = Buffer<T>(context, args.x_size);
    auto y_vec = Buffer<T>(context, args.y_size);
    auto a_mat = Buffer<T>(context, args.a_size);
    auto b_mat = Buffer<T>(context, args.b_size);
    auto c_mat = Buffer<T>(context, args.c_size);
    auto ap_mat = Buffer<T>(context, args.ap_size);
    auto scalar = Buffer<T>(context, args.scalar_size);
    auto scalar_uint = Buffer<unsigned int>(context, args.scalar_size);
    x_vec.Write(queue, args.x_size, x_source);
    y_vec.Write(queue, args.y_size, y_source);
    a_mat.Write(queue, args.a_size, a_source);
    b_mat.Write(queue, args.b_size, b_source);
    c_mat.Write(queue, args.c_size, c_source);
    ap_mat.Write(queue, args.ap_size, ap_source);
    scalar.Write(queue, args.scalar_size, scalar_source);
    auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar, scalar_uint};

    // Runs the routines and collects the timings
    auto timings = std::vector<std::pair<std::string, TimeResult>>();
    auto time_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
    timings.push_back(std::pair<std::string, TimeResult>("CLBlast", time_clblast));
    if (args.compare_clblas) {
      auto time_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
      timings.push_back(std::pair<std::string, TimeResult>("clBLAS", time_clblas));
    }
    if (args.compare_cblas) {
      auto buffers_host = BuffersHost<T>();
      DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
      auto time_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS");
      HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
      timings.push_back(std::pair<std::string, TimeResult>("CPU BLAS", time_cblas));
    }
    if (args.compare_cublas) {
      auto buffers_host = BuffersHost<T>();
      auto buffers_cuda = BuffersCUDA<T>();
      DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
      HostToCUDA(args, buffers_cuda, buffers_host, buffers_in_);
      TimeResult time_cublas;
      try {
        time_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS");
      } catch (std::runtime_error &e) { }
      CUDAToHost(args, buffers_cuda, buffers_host, buffers_out_);
      HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
      timings.push_back(std::pair<std::string, TimeResult>("cuBLAS", time_cublas));
    }

    // Prints the performance of the tested libraries
    PrintTableRow(args, timings);

    // Makes the jump to the next step
    ++s;
    if (s >= args.num_steps) { break; }
    args.m += args.step;
    args.n += args.step;
    args.k += args.step;
    args.a_ld += args.step;
    args.b_ld += args.step;
    args.c_ld += args.step;
  }

  // Cleans-up and returns
  #ifdef CLBLAST_REF_CLBLAS
    if (args.compare_clblas) { clblasTeardown(); }
  #endif
  #ifdef CLBLAST_REF_CUBLAS
    if (args.compare_cublas) { cublasTeardown(args); }
  #endif
}

// =================================================================================================

// Creates a vector of timing results, filled with execution times of the 'main computation'. The
// timing is performed using the milliseconds chrono functions. The function returns the minimum,
// maximum, mean, and standard deviation of the timings, all in milliseconds.
template <typename T, typename U>
template <typename BufferType, typename RoutineType>
typename Client<T,U>::TimeResult Client<T,U>::TimedExecution(const size_t num_runs,
                                                             const Arguments<U> &args,
                                                             BufferType &buffers, Queue &queue,
                                                             RoutineType run_blas,
                                                             const std::string &library_name) {
  auto status = StatusCode::kSuccess;

  // Do an optional warm-up to omit compilation times and initialisations from the measurements
  if (warm_up_) {
    try {
      status = run_blas(args, buffers, queue);
    } catch (...) { status = static_cast<StatusCode>(kUnknownError); }
    if (status != StatusCode::kSuccess) {
      throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
    }
  }

  // Start the timed part
  auto timings = std::vector<double>(num_runs);
  for (auto &timing: timings) {
    auto start_time = std::chrono::steady_clock::now();

    // Executes the main computation
    try {
      status = run_blas(args, buffers, queue);
    } catch (...) { status = static_cast<StatusCode>(kUnknownError); }
    if (status != StatusCode::kSuccess) {
      throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
    }

    // Records and stores the end-time
    auto elapsed_time = std::chrono::steady_clock::now() - start_time;
    timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
  }

  // Compute statistics
  auto result = TimeResult();
  const auto sum = std::accumulate(timings.begin(), timings.end(), 0.0);
  const auto mean = sum / timings.size();
  std::vector<double> diff(timings.size());
  std::transform(timings.begin(), timings.end(), diff.begin(), [mean](double x) { return x - mean; });
  const auto sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
  result.mean = mean;
  result.standard_deviation = std::sqrt(sq_sum / timings.size());
  result.minimum = *std::min_element(timings.begin(), timings.end());
  result.maximum = *std::max_element(timings.begin(), timings.end());
  return result;
}

// =================================================================================================
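
// A small self-contained illustration of the statistics computed in TimedExecution above, on a toy
// set of timings (the values are made up for the example):
//
//   #include <algorithm>
//   #include <cmath>
//   #include <numeric>
//   #include <vector>
//
//   void StatisticsExampleSketch() {
//     const auto timings = std::vector<double>{2.1, 2.0, 2.4, 2.0, 2.5};                  // in ms
//     const auto mean = std::accumulate(timings.begin(), timings.end(), 0.0) / timings.size(); // 2.2
//     auto sq_sum = 0.0;
//     for (const auto t : timings) { sq_sum += (t - mean) * (t - mean); }                 // 0.22
//     const auto stddev = std::sqrt(sq_sum / timings.size());        // ~0.21 (population std-dev)
//     const auto minimum = *std::min_element(timings.begin(), timings.end()); // 2.0 -> 'ms' column
//     (void)stddev; (void)minimum;
//   }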

// Prints the header of the performance table
template <typename T, typename U>
void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {

  // First line (optional)
  if (!args.silent) {
    for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
    if (args.full_statistics) {
      fprintf(stdout, " | <--            CLBlast            -->");
      if (args.compare_clblas) { fprintf(stdout, " | <--            clBLAS             -->"); }
      if (args.compare_cblas) { fprintf(stdout, " | <--           CPU BLAS            -->"); }
      if (args.compare_cublas) { fprintf(stdout, " | <--            cuBLAS             -->"); }
    }
    else {
      fprintf(stdout, " | <--       CLBlast       -->");
      if (args.compare_clblas) { fprintf(stdout, " | <--       clBLAS        -->"); }
      if (args.compare_cblas) { fprintf(stdout, " | <--       CPU BLAS      -->"); }
      if (args.compare_cublas) { fprintf(stdout, " | <--       cuBLAS        -->"); }
    }
    fprintf(stdout, " |\n");
  }

  // Second line
  for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
  if (args.full_statistics) {
    fprintf(stdout, "%9s;%9s;%9s;%9s", "min_ms_1", "max_ms_1", "mean_1", "stddev_1");
    if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_2", "max_ms_2", "mean_2", "stddev_2"); }
    if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_3", "max_ms_3", "mean_3", "stddev_3"); }
    if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_4", "max_ms_4", "mean_4", "stddev_4"); }
  }
  else {
    fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
    if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
    if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
    if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); }
  }
  fprintf(stdout, "\n");
}

// Prints a performance-result row
template <typename T, typename U>
void Client<T,U>::PrintTableRow(const Arguments<U>& args,
                                const std::vector<std::pair<std::string, TimeResult>>& timings) {

  // Creates a vector of relevant variables
  auto integers = std::vector<size_t>{};
  for (auto &o: options_) {
    if      (o == kArgM) { integers.push_back(args.m); }
    else if (o == kArgN) { integers.push_back(args.n); }
    else if (o == kArgK) { integers.push_back(args.k); }
    else if (o == kArgKU) { integers.push_back(args.ku); }
    else if (o == kArgKL) { integers.push_back(args.kl); }
    else if (o == kArgLayout) { integers.push_back(static_cast<size_t>(args.layout)); }
    else if (o == kArgSide) { integers.push_back(static_cast<size_t>(args.side)); }
    else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
    else if (o == kArgATransp) { integers.push_back(static_cast<size_t>(args.a_transpose)); }
    else if (o == kArgBTransp) { integers.push_back(static_cast<size_t>(args.b_transpose)); }
    else if (o == kArgDiagonal) { integers.push_back(static_cast<size_t>(args.diagonal)); }
    else if (o == kArgXInc) { integers.push_back(args.x_inc); }
    else if (o == kArgYInc) { integers.push_back(args.y_inc); }
    else if (o == kArgXOffset) { integers.push_back(args.x_offset); }
    else if (o == kArgYOffset) { integers.push_back(args.y_offset); }
    else if (o == kArgALeadDim) { integers.push_back(args.a_ld); }
    else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); }
    else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); }
    else if (o == kArgAOffset) { integers.push_back(args.a_offset); }
    else if (o == kArgBOffset) { integers.push_back(args.b_offset); }
    else if (o == kArgCOffset) { integers.push_back(args.c_offset); }
    else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); }
    else if (o == kArgDotOffset) { integers.push_back(args.dot_offset); }
    else if (o == kArgNrm2Offset) { integers.push_back(args.nrm2_offset); }
    else if (o == kArgAsumOffset) { integers.push_back(args.asum_offset); }
    else if (o == kArgImaxOffset) { integers.push_back(args.imax_offset); }
    else if (o == kArgBatchCount) { integers.push_back(args.batch_count); }
    else if (o == kArgKernelMode) { integers.push_back(static_cast<size_t>(args.kernel_mode)); }
    else if (o == kArgChannels) { integers.push_back(args.channels); }
    else if (o == kArgHeight) { integers.push_back(args.height); }
    else if (o == kArgWidth) { integers.push_back(args.width); }
    else if (o == kArgKernelH) { integers.push_back(args.kernel_h); }
    else if (o == kArgKernelW) { integers.push_back(args.kernel_w); }
    else if (o == kArgPadH) { integers.push_back(args.pad_h); }
    else if (o == kArgPadW) { integers.push_back(args.pad_w); }
    else if (o == kArgStrideH) { integers.push_back(args.stride_h); }
    else if (o == kArgStrideW) { integers.push_back(args.stride_w); }
    else if (o == kArgDilationH) { integers.push_back(args.dilation_h); }
    else if (o == kArgDilationW) { integers.push_back(args.dilation_w); }
    else if (o == kArgNumKernels) { integers.push_back(args.num_kernels); }
  }
  auto strings = std::vector<std::string>{};
  for (auto &o: options_) {
    if      (o == kArgAlpha) { strings.push_back(ToString(args.alpha)); }
    else if (o == kArgBeta)  { strings.push_back(ToString(args.beta)); }
  }

  // Outputs the argument values
  for (auto &argument: integers) {
    if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
      fprintf(stdout, "%8zuM;", argument/(1024*1024));
    }
    else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
      fprintf(stdout, "%8zuK;", argument/1024);
    }
    else {
      fprintf(stdout, "%9zu;", argument);
    }
  }
  for (auto &argument: strings) {
    fprintf(stdout, "%9s;", argument.c_str());
  }

  // Loops over all tested libraries
  for (const auto& timing : timings) {
    const auto library_name = timing.first;
    const auto minimum_ms = timing.second.minimum;
    if (library_name != "CLBlast") { fprintf(stdout, ";"); }

    // Either outputs full statistics...
    if (args.full_statistics) {
      const auto maximum_ms = timing.second.maximum;
      const auto mean_ms = timing.second.mean;
      const auto standard_deviation = timing.second.standard_deviation;
      fprintf(stdout, "%9.3lf;%9.3lf;%9.3lf;%9.3lf", minimum_ms, maximum_ms, mean_ms, standard_deviation);
    }

    // ... or outputs the minimum time and the GFLOPS and GB/s metrics
    else {
      const auto flops = get_flops_(args);
      const auto bytes = get_bytes_(args);
      const auto gflops = (minimum_ms != 0.0) ? (flops*1e-6)/minimum_ms : 0;
      const auto gbs = (minimum_ms != 0.0) ? (bytes*1e-6)/minimum_ms : 0;
      fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", minimum_ms, gflops, gbs);
    }
  }
  fprintf(stdout, "\n");
}

// =================================================================================================

// Compiles the templated class
template class Client<half, half>;
template class Client<float, float>;
template class Client<double, double>;
template class Client<float2, float2>;
template class Client<double2, double2>;
template class Client<float2, float>;
template class Client<double2, double>;

// =================================================================================================
} // namespace clblast
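
// The GFLOPS and GB/s columns above follow directly from the minimum time. As a worked example,
// using the standard 2*m*n*k flop count for GEMM (which is what a GetFlops implementation for that
// routine would return) and a hypothetical best-of-N time:
//
//   #include <cstdio>
//
//   void MetricsExampleSketch() {
//     const auto m = 1024.0, n = 1024.0, k = 1024.0;
//     const auto flops = 2.0 * m * n * k;               // ~2.15e9 flops for one GEMM
//     const auto minimum_ms = 2.0;                      // hypothetical best-of-N time
//     const auto gflops = (flops * 1e-6) / minimum_ms;  // flops/ms * 1e-6 = GFLOPS -> ~1074
//     std::printf("%9.1lf GFLOPS\n", gflops);
//   }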

CLBlast-1.6.3/test/performance/client.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This class implements the performance-test client. It is generic for all CLBlast routines by
// taking a number of routine-specific functions as arguments, such as how to compute buffer sizes
// or how to get the FLOPS count.
// Typename T: the data-type of the routine's memory buffers (==precision)
// Typename U: the data-type of the alpha and beta arguments
//
// This file also provides the common interface to the performance client (see the 'RunClient'
// function for details).
//
// =================================================================================================

#ifndef CLBLAST_TEST_PERFORMANCE_CLIENT_H_
#define CLBLAST_TEST_PERFORMANCE_CLIENT_H_

#include <string>
#include <vector>
#include <functional>

#include "test/test_utilities.hpp"

// The libraries to test
#ifdef CLBLAST_REF_CLBLAS
  #include <clBLAS.h>
#endif
#include "test/wrapper_cuda.hpp"
#include "utilities/utilities.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T, typename U>
class Client {
 public:
  static const int kSeed;

  struct TimeResult {
    double minimum;
    double maximum;
    double mean;
    double standard_deviation;
  };

  // Shorthand for the routine-specific functions passed to the tester
  using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
  using Reference1 = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
  using Reference2 = std::function<StatusCode(const Arguments<U>&, BuffersHost<T>&, Queue&)>;
  using Reference3 = std::function<StatusCode(const Arguments<U>&, BuffersCUDA<T>&, Queue&)>;
  using SetMetric = std::function<void(Arguments<U>&, Queue&)>;
  using GetMetric = std::function<size_t(const Arguments<U>&)>;

  // The constructor
  Client(const Routine run_routine,
         const Reference1 run_reference1, const Reference2 run_reference2,
         const Reference3 run_reference3,
         const std::vector<std::string> &options,
         const std::vector<std::string> &buffers_in, const std::vector<std::string> &buffers_out,
         const GetMetric get_flops, const GetMetric get_bytes);

  // Parses all command-line arguments, filling in the arguments structure. If no command-line
  // argument is given for a particular argument, it is filled in with a default value.
  Arguments<U> ParseArguments(int argc, char *argv[], const size_t level,
                              const GetMetric default_a_ld, const GetMetric default_b_ld,
                              const GetMetric default_c_ld);

  // The main client function, setting-up arguments, matrices, OpenCL buffers, etc. After set-up, it
  // calls the client routines.
  void PerformanceTest(Arguments<U> &args, const SetMetric set_sizes);

 private:

  // Runs a function a given number of times and returns the timing statistics of all instances
  template <typename BufferType, typename RoutineType>
  TimeResult TimedExecution(const size_t num_runs, const Arguments<U> &args,
                            BufferType &buffers, Queue &queue, RoutineType run_blas,
                            const std::string &library_name);

  // Prints the header of a performance-data table
  void PrintTableHeader(const Arguments<U>& args);

  // Prints a row of performance data, including results of the compared libraries
  void PrintTableRow(const Arguments<U>& args,
                     const std::vector<std::pair<std::string, TimeResult>>& timings);

  // The routine-specific functions passed to the tester
  const Routine run_routine_;
  const Reference1 run_reference1_;
  const Reference2 run_reference2_;
  const Reference3 run_reference3_;
  const std::vector<std::string> options_;
  const std::vector<std::string> buffers_in_;
  const std::vector<std::string> buffers_out_;
  const GetMetric get_flops_;
  const GetMetric get_bytes_;

  // Extra arguments
  bool warm_up_; // if enabled, do a warm-up run first before measuring execution time
};

// =================================================================================================

// Bogus reference function, in case a comparison library is not available
template <typename T, typename U, typename BufferType>
static StatusCode ReferenceNotAvailable(const Arguments<U> &, BufferType &, Queue &) {
  return StatusCode::kNotImplemented;
}

// The interface to the performance client. This is a separate function in the header such that it
// is automatically compiled for each routine, templated by the parameter "C".
template <typename C, typename T, typename U>
void RunClient(int argc, char *argv[]) {

  // Sets the reference to test against
  #ifdef CLBLAST_REF_CLBLAS
    auto reference1 = C::RunReference1; // clBLAS when available
  #else
    auto reference1 = ReferenceNotAvailable<T,U,Buffers<T>>;
  #endif
  #ifdef CLBLAST_REF_CBLAS
    auto reference2 = C::RunReference2; // CBLAS when available
  #else
    auto reference2 = ReferenceNotAvailable<T,U,BuffersHost<T>>;
  #endif
  #ifdef CLBLAST_REF_CUBLAS
    auto reference3 = C::RunReference3; // cuBLAS when available
  #else
    auto reference3 = ReferenceNotAvailable<T,U,BuffersCUDA<T>>;
  #endif

  // Creates a new client
  auto client = Client<T,U>(C::RunRoutine, reference1, reference2, reference3, C::GetOptions(),
                            C::BuffersIn(), C::BuffersOut(), C::GetFlops, C::GetBytes);

  // Simple command line argument parser with defaults
  auto args = client.ParseArguments(argc, argv, C::BLASLevel(),
                                    C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
  if (args.print_help) { return; }

  // Runs the client
  client.PerformanceTest(args, C::SetSizes);
}

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_PERFORMANCE_CLIENT_H_
#endif
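
// RunClient expects its template parameter "C" to expose a static interface. The outline below is
// reconstructed from the calls in RunClient and PerformanceTest: the member names are the ones
// actually used above, but the class itself ("TestXroutine") and the return value of BLASLevel()
// are hypothetical placeholders:
//
//   #include <cstddef>
//   #include <string>
//   #include <vector>
//
//   #include "test/performance/client.hpp"
//
//   template <typename T>
//   class TestXroutine {
//    public:
//     static size_t BLASLevel() { return 2; }           // BLAS level of the routine
//     static std::vector<std::string> GetOptions();     // which kArg* options apply
//     static std::vector<std::string> BuffersIn();      // buffers the routine reads
//     static std::vector<std::string> BuffersOut();     // buffers the routine writes
//     static void SetSizes(clblast::Arguments<T> &args, clblast::Queue &queue); // sets buffer sizes
//     static size_t DefaultLDA(const clblast::Arguments<T> &args); // default leading dimensions
//     static size_t DefaultLDB(const clblast::Arguments<T> &args);
//     static size_t DefaultLDC(const clblast::Arguments<T> &args);
//     static size_t GetFlops(const clblast::Arguments<T> &args);   // feeds the GFLOPS column
//     static size_t GetBytes(const clblast::Arguments<T> &args);   // feeds the GB/s column
//     static clblast::StatusCode RunRoutine(const clblast::Arguments<T> &, clblast::Buffers<T> &, clblast::Queue &);
//     static clblast::StatusCode RunReference1(const clblast::Arguments<T> &, clblast::Buffers<T> &, clblast::Queue &);     // clBLAS
//     static clblast::StatusCode RunReference2(const clblast::Arguments<T> &, clblast::BuffersHost<T> &, clblast::Queue &); // CBLAS
//     static clblast::StatusCode RunReference3(const clblast::Arguments<T> &, clblast::BuffersCUDA<T> &, clblast::Queue &); // cuBLAS
//   };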

CLBlast-1.6.3/test/performance/routines/
CLBlast-1.6.3/test/performance/routines/level1/
CLBlast-1.6.3/test/performance/routines/level1/xamax.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xamax.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXamax<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXamax<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXamax<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXamax<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXamax<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xasum.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xasum.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXasum<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXasum<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXasum<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXasum<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXasum<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xaxpy.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xaxpy.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXaxpy<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXaxpy<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXaxpy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXaxpy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xcopy.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xcopy.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXcopy<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXcopy<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXcopy<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXcopy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXcopy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
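
// For the level-1 clients above, the GetFlops/GetBytes metrics are simple linear expressions in
// the vector length. For example, for AXPY (y = alpha*x + y) one could count as follows (a sketch
// of the counting convention, not CLBlast's exact implementation):
//
//   #include <cstddef>
//
//   template <typename T>
//   std::size_t AxpyFlopsSketch(const std::size_t n) { return 2 * n; } // one multiply + one add per element
//
//   template <typename T>
//   std::size_t AxpyBytesSketch(const std::size_t n) { return 3 * n * sizeof(T); } // read x, read y, write y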

CLBlast-1.6.3/test/performance/routines/level1/xdot.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xdot.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXdot<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXdot<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXdot<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xdotc.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xdotc.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXdotc<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXdotc<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
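
// The dispatchers in these files all follow the same pattern: read the -precision command-line
// flag and instantiate RunClient for the corresponding data-type. A self-contained sketch of such
// a selection function, assuming the numeric encoding CLBlast uses for its Precision enum
// (16/32/64/3232/6464):
//
//   enum class PrecisionSketch { kHalf = 16, kSingle = 32, kDouble = 64,
//                                kComplexSingle = 3232, kComplexDouble = 6464 };
//
//   PrecisionSketch GetPrecisionSketch(const int value) {
//     switch (value) {
//       case 16: return PrecisionSketch::kHalf;
//       case 64: return PrecisionSketch::kDouble;
//       case 3232: return PrecisionSketch::kComplexSingle;
//       case 6464: return PrecisionSketch::kComplexDouble;
//       default: return PrecisionSketch::kSingle; // 32, and the fall-back
//     }
//   }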

CLBlast-1.6.3/test/performance/routines/level1/xdotu.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xdotu.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXdotu<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXdotu<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xnrm2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xnrm2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXnrm2<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXnrm2<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXnrm2<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXnrm2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXnrm2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
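
// The xROT routine tested below applies a Givens plane rotation to two vectors. In scalar form,
// the reference semantics are as in this sketch (not CLBlast's kernel implementation):
//
//   #include <cstddef>
//   #include <vector>
//
//   void RotSketch(std::vector<float> &x, std::vector<float> &y, const float c, const float s) {
//     for (std::size_t i = 0; i < x.size(); ++i) {
//       const auto xi = x[i];
//       x[i] = c * xi + s * y[i];   // rotated x
//       y[i] = c * y[i] - s * xi;   // rotated y
//     }
//   }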

CLBlast-1.6.3/test/performance/routines/level1/xrot.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xrot.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXrot<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXrot<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xrotg.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xrotg.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXrotg<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXrotg<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xrotm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xrotm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXrotm<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXrotm<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xrotmg.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xrotmg.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXrotmg<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXrotmg<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xscal.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xscal.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXscal<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXscal<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXscal<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXscal<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXscal<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level1/xswap.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level1/xswap.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXswap<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXswap<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXswap<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXswap<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXswap<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/
CLBlast-1.6.3/test/performance/routines/level2/xgbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xgbmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXgbmv<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXgbmv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXgbmv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXgbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXgbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xgemv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xgemv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXgemv<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXgemv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXgemv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXgemv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
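
// The same RunClient entry point can also be driven programmatically, e.g. from a custom main that
// forces a single precision instead of parsing -precision. TestXgemv is the real test-case class
// for GEMV; the fixed argv below and the "-runs" flag name are illustrative:
//
//   #include "test/performance/client.hpp"
//   #include "test/routines/level2/xgemv.hpp"
//
//   int main() {
//     const char *argv[] = {"client", "-m", "2048", "-n", "2048", "-runs", "5"};
//     clblast::RunClient<clblast::TestXgemv<float>, float, float>(7, const_cast<char**>(argv));
//     return 0;
//   }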
CLBlast-1.6.3/test/performance/routines/level2/xger.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xger.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXger<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXger<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXger<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xgerc.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xgerc.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXgerc<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXgerc<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xgeru.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xgeru.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXgeru<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXgeru<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xhbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xhbmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXhbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXhbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xhemv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xhemv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXhemv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXhemv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xher.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xher.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXher<clblast::float2>, clblast::float2, float>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXher<clblast::double2>, clblast::double2, double>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xher2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xher2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXher2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXher2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xhpmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xhpmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXhpmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXhpmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xhpr.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xhpr.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXhpr<clblast::float2>, clblast::float2, float>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXhpr<clblast::double2>, clblast::double2, double>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xhpr2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xhpr2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXhpr2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXhpr2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xsbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xsbmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXsbmv<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXsbmv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXsbmv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xspmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xspmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXspmv<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXspmv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXspmv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xspr.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xspr.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXspr<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXspr<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXspr<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xspr2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xspr2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXspr2<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXspr2<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXspr2<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xsymv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xsymv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXsymv<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXsymv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXsymv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xsyr.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xsyr.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXsyr<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXsyr<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXsyr<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xsyr2.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xsyr2.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXsyr2<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXsyr2<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXsyr2<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xtbmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xtbmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXtbmv<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXtbmv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXtbmv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXtbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXtbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xtbsv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xtbsv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXtbsv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXtbsv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXtbsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXtbsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xtpmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xtpmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXtpmv<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXtpmv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXtpmv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXtpmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXtpmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xtpsv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xtpsv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXtpsv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXtpsv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXtpsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXtpsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level2/xtrmv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xtrmv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXtrmv<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXtrmv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXtrmv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXtrmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXtrmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level2/xtrsv.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level2/xtrsv.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXtrsv<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXtrsv<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXtrsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXtrsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level3/

CLBlast-1.6.3/test/performance/routines/level3/xgemm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xgemm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXgemm<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXgemm<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXgemm<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXgemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXgemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
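The routine timed by this client is the CLBlast GEMM entry point itself. As a minimal host-side sketch of the single-precision call being benchmarked (context, queue, and buffer creation are omitted; queue is assumed to be a valid cl_command_queue created by the caller):

#include <clblast.h>

// Computes C = alpha * A * B + beta * C for row-major A (m x k), B (k x n), and C (m x n)
void RunSgemm(const size_t m, const size_t n, const size_t k,
              cl_mem a, cl_mem b, cl_mem c, cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Gemm(clblast::Layout::kRowMajor,
                                    clblast::Transpose::kNo, clblast::Transpose::kNo,
                                    m, n, k,
                                    1.0f, a, 0, k,   // alpha and A (leading dimension k)
                                    b, 0, n,         // B (leading dimension n)
                                    0.0f, c, 0, n,   // beta and C (leading dimension n)
                                    &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);  // wait for completion before reading the results
    clReleaseEvent(event);
  }
}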
CLBlast-1.6.3/test/performance/routines/level3/xhemm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xhemm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXhemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXhemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level3/xher2k.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xher2k.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXher2k<clblast::float2>, clblast::float2, float>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXher2k<clblast::double2>, clblast::double2, double>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level3/xherk.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xherk.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kComplexSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXherk<clblast::float2>, clblast::float2, float>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXherk<clblast::double2>, clblast::double2, double>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level3/xsymm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xsymm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXsymm<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXsymm<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXsymm<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXsymm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXsymm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level3/xsyr2k.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xsyr2k.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXsyr2k<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXsyr2k<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXsyr2k<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXsyr2k<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXsyr2k<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level3/xsyrk.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xsyrk.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXsyrk<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXsyrk<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXsyrk<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXsyrk<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXsyrk<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/level3/xtrmm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xtrmm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXtrmm<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXtrmm<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXtrmm<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXtrmm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXtrmm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/level3/xtrsm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/level3/xtrsm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXtrsm<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXtrsm<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXtrsm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXtrsm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/levelx/
CLBlast-1.6.3/test/performance/routines/levelx/xaxpybatched.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xaxpybatched.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXaxpyBatched<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXaxpyBatched<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXaxpyBatched<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXaxpyBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXaxpyBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/levelx/xcol2im.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xcol2im.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXcol2im<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXcol2im<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXcol2im<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXcol2im<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXcol2im<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/levelx/xconvgemm.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xconvgemm.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXconvgemm<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXconvgemm<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXconvgemm<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
    case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/levelx/xgemmbatched.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xgemmbatched.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXgemmBatched<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXgemmBatched<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXgemmBatched<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXgemmBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXgemmBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/levelx/xgemmstridedbatched.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xgemmstridedbatched.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXgemmStridedBatched<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXgemmStridedBatched<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXgemmStridedBatched<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXgemmStridedBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXgemmStridedBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================

CLBlast-1.6.3/test/performance/routines/levelx/xhad.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xhad.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXhad<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXhad<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXhad<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXhad<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXhad<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/levelx/xhad.cpp000066400000000000000000000032131463263031500230660ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xhad.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXhad<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXhad<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXhad<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXhad<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXhad<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/levelx/xim2col.cpp000066400000000000000000000032351463263031500235230ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xim2col.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXim2col<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXim2col<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXim2col<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXim2col<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXim2col<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
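// Xim2col, exercised by the client above, rearranges sliding convolution windows of an input
// image into columns of a matrix so that a convolution can be computed as a single GEMM. A
// schematic scalar version for a stride-1, no-padding, no-dilation case (illustration only;
// CLBlast's routine also supports strides, padding and dilation and runs on the device):
#include <vector>

std::vector<float> im2col_simple(const std::vector<float> &im, const int h, const int w,
                                 const int kh, const int kw) {
  const int out_h = h - kh + 1, out_w = w - kw + 1;
  std::vector<float> col(static_cast<std::size_t>(kh * kw) * out_h * out_w);
  for (int ki = 0; ki < kh; ++ki)        // kernel row
    for (int kj = 0; kj < kw; ++kj)      // kernel column
      for (int i = 0; i < out_h; ++i)    // output row
        for (int j = 0; j < out_w; ++j)  // output column
          col[((ki * kw + kj) * out_h + i) * out_w + j] = im[(i + ki) * w + (j + kj)];
  return col;
}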
CLBlast-1.6.3/test/performance/routines/levelx/xinvert.cpp000066400000000000000000000032651463263031500236500ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xinvert.hpp"

// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXinvert<half>, half, half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXinvert<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXinvert<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXinvert<float2>, float2, float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXinvert<double2>, double2, double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
CLBlast-1.6.3/test/performance/routines/levelx/xomatcopy.cpp000066400000000000000000000032511463263031500241670ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================

#include "test/performance/client.hpp"
#include "test/routines/levelx/xomatcopy.hpp"

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
    case clblast::Precision::kHalf:
      clblast::RunClient<clblast::TestXomatcopy<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
    case clblast::Precision::kSingle:
      clblast::RunClient<clblast::TestXomatcopy<float>, float, float>(argc, argv); break;
    case clblast::Precision::kDouble:
      clblast::RunClient<clblast::TestXomatcopy<double>, double, double>(argc, argv); break;
    case clblast::Precision::kComplexSingle:
      clblast::RunClient<clblast::TestXomatcopy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
    case clblast::Precision::kComplexDouble:
      clblast::RunClient<clblast::TestXomatcopy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
  }
  return 0;
}

// =================================================================================================
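// Xomatcopy, tested above, performs an out-of-place scaled matrix copy/transpose: B = alpha *
// op(A), with op either the identity or a transpose. A scalar reference for the transposing,
// column-major case (illustration only, not CLBlast code):
#include <cstddef>

void omatcopy_transpose_colmajor(const std::size_t m, const std::size_t n, const float alpha,
                                 const float *a, const std::size_t a_ld,
                                 float *b, const std::size_t b_ld) {
  for (std::size_t i = 0; i < m; ++i)      // rows of A
    for (std::size_t j = 0; j < n; ++j)    // columns of A
      b[i * b_ld + j] = alpha * a[j * a_ld + i];  // B(j,i) = alpha * A(i,j), column-major
}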
CLBlast-1.6.3/test/routines/000077500000000000000000000000001463263031500156375ustar00rootroot00000000000000CLBlast-1.6.3/test/routines/common.hpp000066400000000000000000000022021463263031500176440ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common includes for the clients and tests
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_COMMON_H_
#define CLBLAST_TEST_ROUTINES_COMMON_H_

#include <vector>
#include <string>

#include "utilities/utilities.hpp"
#include "test/test_utilities.hpp"

#ifdef CLBLAST_REF_CLBLAS
  #include "test/wrapper_clblas.hpp"
#endif
#ifdef CLBLAST_REF_CBLAS
  #include "test/wrapper_cblas.hpp"
#endif
#include "test/wrapper_cuda.hpp"
#ifdef CLBLAST_REF_CUBLAS
  #include "test/wrapper_cublas.hpp"
#endif

// =================================================================================================

// CLBLAST_TEST_ROUTINES_COMMON_H_
#endif
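// The reference back-ends included above are compile-time optional: the build system defines
// CLBLAST_REF_CLBLAS / CLBLAST_REF_CBLAS / CLBLAST_REF_CUBLAS only when the corresponding library
// is available. A sketch of how dependent code can guard on these defines (hypothetical function
// name, same guard pattern as the includes above):
#include <stdexcept>

void require_a_reference_blas() {
  #if !defined(CLBLAST_REF_CLBLAS) && !defined(CLBLAST_REF_CBLAS) && !defined(CLBLAST_REF_CUBLAS)
    throw std::runtime_error("No reference BLAS available: build with clBLAS, CBLAS or cuBLAS");
  #endif
}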
CLBlast-1.6.3/test/routines/level1/000077500000000000000000000000001463263031500170275ustar00rootroot00000000000000CLBlast-1.6.3/test/routines/level1/xamax.hpp000066400000000000000000000155751463263031500206700ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xamax routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XAMAX_H_
#define CLBLAST_TEST_ROUTINES_XAMAX_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXamax {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 1; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgXInc, kArgXOffset, kArgImaxOffset};
  }
  static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalarUint}; }
  static std::vector<std::string> BuffersOut() { return {kBufScalarUint}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeImax(const Arguments<T> &args) {
    return args.imax_offset + 1;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.x_size = GetSizeX(args);
    args.scalar_size = GetSizeImax(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Amax<T>(args.n,
                            buffers.scalar_uint(), args.imax_offset,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Amax<T>(args.n,
                            buffers.scalar_uint(), args.imax_offset,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXamax(args.n,
                                buffers.scalar_uint, args.imax_offset,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXamax(args.n,
                 buffers_host.scalar_uint, args.imax_offset,
                 buffers_host.x_vec, args.x_offset, args.x_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXamax(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n,
                                buffers.scalar_uint, args.imax_offset,
                                buffers.x_vec, args.x_offset, args.x_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<unsigned int> result_uint(args.scalar_size, 0);
    buffers.scalar_uint.Read(queue, args.scalar_size, result_uint);
    // The result is an integer. However, since the test infrastructure assumes results of
    // type 'T' (float/double/float2/double2/half), we store the results into T instead.
    // The values might then become meaningless, but a comparison for testing should still
    // be valid to verify correctness.
    auto result_as_T = static_cast<T>(result_uint[0]);
    std::vector<T> result(args.scalar_size);
    result[0] = result_as_T;
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t, const size_t) {
    return args.imax_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) {
    return args.n;
  }
  static size_t GetBytes(const Arguments<T> &args) {
    return ((args.n) + 1) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XAMAX_H_
#endif
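// Every TestX class in this directory exposes the same static interface, so the correctness and
// performance testers are written once and parameterised over it. A minimal sketch of a consumer
// of that interface ('report_metrics' is illustrative, not the actual tester):
#include <cstdio>

template <typename C, typename T>
void report_metrics(const clblast::Arguments<T> &args, const double seconds) {
  const auto gflops = 1.0e-9 * static_cast<double>(C::GetFlops(args)) / seconds;
  const auto gbs = 1.0e-9 * static_cast<double>(C::GetBytes(args)) / seconds;
  std::printf("level-%zu routine: %.2f GFLOPS, %.2f GB/s\n", C::BLASLevel(), gflops, gbs);
}
// For example: report_metrics<clblast::TestXamax<float>, float>(args, measured_seconds);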
CLBlast-1.6.3/test/routines/level1/xasum.hpp000066400000000000000000000146271463263031500207050ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xasum routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XASUM_H_
#define CLBLAST_TEST_ROUTINES_XASUM_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXasum {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 1; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgXInc, kArgXOffset, kArgAsumOffset};
  }
  static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; }
  static std::vector<std::string> BuffersOut() { return {kBufScalar}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeAsum(const Arguments<T> &args) {
    return 1 + args.asum_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.x_size = GetSizeX(args);
    args.scalar_size = GetSizeAsum(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Asum<T>(args.n,
                            buffers.scalar(), args.asum_offset,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Asum<T>(args.n,
                            buffers.scalar(), args.asum_offset,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXasum(args.n,
                                buffers.scalar, args.asum_offset,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXasum(args.n,
                 buffers_host.scalar, args.asum_offset,
                 buffers_host.x_vec, args.x_offset, args.x_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXasum(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n,
                                buffers.scalar, args.asum_offset,
                                buffers.x_vec, args.x_offset, args.x_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.scalar_size, static_cast<T>(0));
    buffers.scalar.Read(queue, args.scalar_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t, const size_t) {
    return args.asum_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) {
    return args.n;
  }
  static size_t GetBytes(const Arguments<T> &args) {
    return ((args.n) + 1) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XASUM_H_
#endif
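// The metrics above follow from what Xasum touches: it reads n elements and writes one result,
// hence ((n) + 1) * sizeof(T) bytes, and performs on the order of n additions. A scalar reference
// version of the computation itself, for real types (illustration only, not CLBlast code):
#include <cmath>
#include <cstddef>

float asum_reference(const std::size_t n, const float *x,
                     const std::size_t x_offset, const std::size_t x_inc) {
  auto sum = 0.0f;
  for (std::size_t i = 0; i < n; ++i) { sum += std::fabs(x[x_offset + i * x_inc]); }
  return sum;
}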
CLBlast-1.6.3/test/routines/level1/xaxpy.hpp000066400000000000000000000147631463263031500207220ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xaxpy routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_ #define CLBLAST_TEST_ROUTINES_XAXPY_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXaxpy { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 1; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Axpy(args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Axpy(args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXaxpy(args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXaxpy(args.n, args.alpha, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; 
} #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXaxpy(reinterpret_cast(args.cublas_handle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); buffers.y_vec.Read(queue, args.y_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1*args.y_inc + args.y_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n; } static size_t GetBytes(const Arguments &args) { return (3 * args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XAXPY_H_ #endif CLBlast-1.6.3/test/routines/level1/xcopy.hpp000066400000000000000000000147041463263031500207100ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xcopy routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XCOPY_H_ #define CLBLAST_TEST_ROUTINES_XCOPY_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXcopy { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 1; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset}; } static std::vector BuffersIn() { return {kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Copy(args.n, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Copy(args.n, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXcopy(args.n, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXcopy(args.n, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for 
correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXcopy(reinterpret_cast(args.cublas_handle), args.n, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); buffers.y_vec.Read(queue, args.y_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1*args.y_inc + args.y_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 1 * args.n; } static size_t GetBytes(const Arguments &args) { return (2 * args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XCOPY_H_ #endif CLBlast-1.6.3/test/routines/level1/xdot.hpp000066400000000000000000000156201463263031500205220ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xdot routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XDOT_H_ #define CLBLAST_TEST_ROUTINES_XDOT_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXdot { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 1; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } static std::vector BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeDot(const Arguments &args) { return 1 + args.dot_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); args.scalar_size = GetSizeDot(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Dot(args.n, buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Dot(args.n, buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdot(args.n, buffers.scalar, args.dot_offset, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments 
&args, BuffersHost &buffers_host, Queue &) { cblasXdot(args.n, buffers_host.scalar, args.dot_offset, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXdot(reinterpret_cast(args.cublas_handle), args.n, buffers.scalar, args.dot_offset, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); buffers.scalar.Read(queue, args.scalar_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { return args.dot_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n; } static size_t GetBytes(const Arguments &args) { return ((2 * args.n) + 1) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XDOT_H_ #endif CLBlast-1.6.3/test/routines/level1/xdotc.hpp000066400000000000000000000156541463263031500206740ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xdotc routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XDOTC_H_ #define CLBLAST_TEST_ROUTINES_XDOTC_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXdotc { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 1; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } static std::vector BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeDot(const Arguments &args) { return 1 + args.dot_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); args.scalar_size = GetSizeDot(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotc(args.n, buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Dotc(args.n, buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotc(args.n, buffers.scalar, args.dot_offset, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const 
Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXdotc(args.n, buffers_host.scalar, args.dot_offset, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXdotc(reinterpret_cast(args.cublas_handle), args.n, buffers.scalar, args.dot_offset, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); buffers.scalar.Read(queue, args.scalar_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { return args.dot_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n; } static size_t GetBytes(const Arguments &args) { return ((2 * args.n) + 1) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XDOTC_H_ #endif CLBlast-1.6.3/test/routines/level1/xdotu.hpp000066400000000000000000000156541463263031500207160ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xdotu routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XDOTU_H_ #define CLBLAST_TEST_ROUTINES_XDOTU_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXdotu { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 1; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } static std::vector BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeDot(const Arguments &args) { return 1 + args.dot_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); args.scalar_size = GetSizeDot(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotu(args.n, buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Dotu(args.n, buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotu(args.n, buffers.scalar, args.dot_offset, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const 
Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXdotu(args.n, buffers_host.scalar, args.dot_offset, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXdotu(reinterpret_cast(args.cublas_handle), args.n, buffers.scalar, args.dot_offset, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); buffers.scalar.Read(queue, args.scalar_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { return args.dot_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n; } static size_t GetBytes(const Arguments &args) { return ((2 * args.n) + 1) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XDOTU_H_ #endif CLBlast-1.6.3/test/routines/level1/xnrm2.hpp000066400000000000000000000146371463263031500206210ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xnrm2 routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XNRM2_H_ #define CLBLAST_TEST_ROUTINES_XNRM2_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXnrm2 { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 1; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgXOffset, kArgNrm2Offset}; } static std::vector BuffersIn() { return {kBufVecX, kBufScalar}; } static std::vector BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeNrm2(const Arguments &args) { return 1 + args.nrm2_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); args.scalar_size = GetSizeNrm2(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Nrm2(args.n, buffers.scalar(), args.nrm2_offset, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Nrm2(args.n, buffers.scalar(), args.nrm2_offset, buffers.x_vec(), args.x_offset, args.x_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXnrm2(args.n, buffers.scalar, args.nrm2_offset, buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXnrm2(args.n, buffers_host.scalar, args.nrm2_offset, buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) 
#ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXnrm2(reinterpret_cast(args.cublas_handle), args.n, buffers.scalar, args.nrm2_offset, buffers.x_vec, args.x_offset, args.x_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.scalar_size, static_cast(0)); buffers.scalar.Read(queue, args.scalar_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { return args.nrm2_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n; } static size_t GetBytes(const Arguments &args) { return ((args.n) + 1) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XNRM2_H_ #endif CLBlast-1.6.3/test/routines/level1/xscal.hpp000066400000000000000000000137451463263031500206640ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xscal routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XSCAL_H_ #define CLBLAST_TEST_ROUTINES_XSCAL_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXscal { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 1; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgXOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufVecX}; } static std::vector BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Scal(args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Scal(args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXscal(args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXscal(args.n, args.alpha, buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXscal(reinterpret_cast(args.cublas_handle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc); 
CLBlast-1.6.3/test/routines/level1/xswap.hpp000066400000000000000000000151131463263031500207030ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xswap routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XSWAP_H_
#define CLBLAST_TEST_ROUTINES_XSWAP_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXswap {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 1; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset};
  }
  static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufVecX, kBufVecY}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    return args.n * args.y_inc + args.y_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Swap(args.n,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Swap(args.n,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXswap(args.n,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXswap(args.n,
                 buffers_host.x_vec, args.x_offset, args.x_inc,
                 buffers_host.y_vec, args.y_offset, args.y_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXswap(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                buffers.y_vec, args.y_offset, args.y_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.x_size + args.y_size, static_cast<T>(0));
    buffers.x_vec.Read(queue, args.x_size, &result[0]);
    buffers.y_vec.Read(queue, args.y_size, &result[args.x_size]);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
  static size_t ResultID2(const Arguments<T> &) { return 2; } // x_vec and y_vec
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return (id2 == 0) ? id1*args.x_inc + args.x_offset
                      : args.x_size + id1*args.y_inc + args.y_offset; // y follows x in the download
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return args.n; }
  static size_t GetBytes(const Arguments<T> &args) { return (2 * args.n) * sizeof(T); }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XSWAP_H_
#endif
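
// =================================================================================================
// Editor's note: a small standalone sketch (not CLBlast code; names are illustrative) of how a
// checker addresses the concatenated result that TestXswap::DownloadResult produces: x occupies
// indices [0, x_size) and y occupies [x_size, x_size + y_size), which is why the index for
// id2 == 1 starts at x_size.

#include <cassert>
#include <cstddef>

// Index into the concatenated [x | y] result vector, mirroring GetResultIndex above.
std::size_t result_index(std::size_t id1, std::size_t id2,
                         std::size_t x_inc, std::size_t x_offset, std::size_t x_size,
                         std::size_t y_inc, std::size_t y_offset) {
  return (id2 == 0) ? id1 * x_inc + x_offset           // element id1 of x
                    : x_size + id1 * y_inc + y_offset; // element id1 of y, after the x block
}

int main() {
  // n = 3, x_inc = 2, x_offset = 1 -> x occupies indices [0, 7); y (inc 1, offset 0) starts at 7
  assert(result_index(0, 1, 2, 1, 7, 1, 0) == 7);
  assert(result_index(2, 0, 2, 1, 7, 1, 0) == 5);
  return 0;
}
// =================================================================================================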
CLBlast-1.6.3/test/routines/level2/000077500000000000000000000000001463263031500170305ustar00rootroot00000000000000
CLBlast-1.6.3/test/routines/level2/xgbmv.hpp000066400000000000000000000204071463263031500206670ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xgbmv routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_
#define CLBLAST_TEST_ROUTINES_XGBMV_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXgbmv {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgM, kArgN, kArgKL, kArgKU, kArgLayout, kArgATransp,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha, kArgBeta};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufVecY}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    auto a_transposed = (args.a_transpose != Transpose::kNo);
    auto n_real = (a_transposed) ? args.m : args.n;
    return n_real * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    auto a_transposed = (args.a_transpose != Transpose::kNo);
    auto m_real = (a_transposed) ? args.n : args.m;
    return m_real * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    auto a_rotated = (args.layout == Layout::kRowMajor);
    auto a_two = (a_rotated) ? args.m : args.n;
    return a_two * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &all) { return all; }
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Gbmv(args.layout, args.a_transpose,
                         args.m, args.n, args.kl, args.ku, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Gbmv(args.layout, args.a_transpose,
                         args.m, args.n, args.kl, args.ku, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXgbmv(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.a_transpose),
                                args.m, args.n, args.kl, args.ku, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXgbmv(convertToCBLAS(args.layout),
                 convertToCBLAS(args.a_transpose),
                 args.m, args.n, args.kl, args.ku, args.alpha,
                 buffers_host.a_mat, args.a_offset, args.a_ld,
                 buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
                 buffers_host.y_vec, args.y_offset, args.y_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXgbmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                convertToCUBLAS(args.a_transpose),
                                args.m, args.n, args.kl, args.ku, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.y_size, static_cast<T>(0));
    buffers.y_vec.Read(queue, args.y_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) {
    auto a_transposed = (args.a_transpose != Transpose::kNo);
    return (a_transposed) ? args.n : args.m;
  }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1*args.y_inc + args.y_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return 2 * args.m * args.n; }
  static size_t GetBytes(const Arguments<T> &args) {
    auto a_rotated = (args.layout == Layout::kRowMajor);
    auto a_one = (a_rotated) ? args.n : args.m;
    auto a_two = (a_rotated) ? args.m : args.n;
    return ((args.kl+args.ku+1)*a_two + 2*a_one + a_two) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XGBMV_H_
#endif
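
// =================================================================================================
// Editor's note: an illustrative standalone calculation (not part of CLBlast) of the xgbmv byte
// count above. A general band matrix stores only kl sub-diagonals, ku super-diagonals and the main
// diagonal, so roughly kl+ku+1 elements per column are touched; x is read once and y is read and
// written. All values below are made up for the example.

#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t m = 1024, n = 768, kl = 4, ku = 8;
  const std::size_t elements = (kl + ku + 1) * n  // band of A (column-major, so a_two == n)
                             + n                  // read of x
                             + 2 * m;             // read + write of y
  std::printf("approx bytes moved (float): %zu\n", elements * sizeof(float));
  return 0;
}
// =================================================================================================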
CLBlast-1.6.3/test/routines/level2/xgemv.hpp000066400000000000000000000177711463263031500207020ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xgemv routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_
#define CLBLAST_TEST_ROUTINES_XGEMV_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXgemv {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgM, kArgN, kArgLayout, kArgATransp,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha, kArgBeta};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufVecY}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    auto a_transposed = (args.a_transpose != Transpose::kNo);
    auto n_real = (a_transposed) ? args.m : args.n;
    return n_real * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    auto a_transposed = (args.a_transpose != Transpose::kNo);
    auto m_real = (a_transposed) ? args.n : args.m;
    return m_real * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    auto a_rotated = (args.layout == Layout::kRowMajor);
    auto a_two = (a_rotated) ? args.m : args.n;
    return a_two * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &all) { return all; }
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Gemv(args.layout, args.a_transpose,
                         args.m, args.n, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Gemv(args.layout, args.a_transpose,
                         args.m, args.n, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXgemv(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.a_transpose),
                                args.m, args.n, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXgemv(convertToCBLAS(args.layout),
                 convertToCBLAS(args.a_transpose),
                 args.m, args.n, args.alpha,
                 buffers_host.a_mat, args.a_offset, args.a_ld,
                 buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
                 buffers_host.y_vec, args.y_offset, args.y_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXgemv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                convertToCUBLAS(args.a_transpose),
                                args.m, args.n, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.y_size, static_cast<T>(0));
    buffers.y_vec.Read(queue, args.y_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) {
    auto a_transposed = (args.a_transpose != Transpose::kNo);
    return (a_transposed) ? args.n : args.m;
  }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1*args.y_inc + args.y_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return 2 * args.m * args.n; }
  static size_t GetBytes(const Arguments<T> &args) {
    return (args.m*args.n + 2*args.m + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XGEMV_H_
#endif
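
// =================================================================================================
// Editor's note: a standalone sketch (not CLBlast code) of how a performance tester can turn the
// GetFlops/GetBytes counts above into throughput numbers. The runtime below is an assumed,
// made-up measurement; in the real clients it would come from timing the routine.

#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t m = 2048, n = 2048;
  const double seconds = 1.0e-3;                                       // assumed measured runtime
  const double flops = 2.0 * m * n;                                    // as in TestXgemv::GetFlops
  const double bytes = (double(m) * n + 2 * m + n) * sizeof(float);    // as in TestXgemv::GetBytes
  std::printf("%.1f GFLOPS, %.1f GB/s\n", flops / seconds * 1e-9, bytes / seconds * 1e-9);
  return 0;
}
// =================================================================================================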
CLBlast-1.6.3/test/routines/level2/xger.hpp000066400000000000000000000167311463263031500205160ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xger routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XGER_H_
#define CLBLAST_TEST_ROUTINES_XGER_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXger {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgM, kArgN, kArgLayout,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufMatA}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.m * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    return args.n * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    auto a_rotated = (args.layout == Layout::kRowMajor);
    auto a_two = (a_rotated) ? args.m : args.n;
    return a_two * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Ger(args.layout, args.m, args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Ger(args.layout, args.m, args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXger(convertToCLBLAS(args.layout),
                               args.m, args.n, args.alpha,
                               buffers.x_vec, args.x_offset, args.x_inc,
                               buffers.y_vec, args.y_offset, args.y_inc,
                               buffers.a_mat, args.a_offset, args.a_ld,
                               1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXger(convertToCBLAS(args.layout),
                args.m, args.n, args.alpha,
                buffers_host.x_vec, args.x_offset, args.x_inc,
                buffers_host.y_vec, args.y_offset, args.y_inc,
                buffers_host.a_mat, args.a_offset, args.a_ld);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXger(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                               args.m, args.n, args.alpha,
                               buffers.x_vec, args.x_offset, args.x_inc,
                               buffers.y_vec, args.y_offset, args.y_inc,
                               buffers.a_mat, args.a_offset, args.a_ld);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.a_size, static_cast<T>(0));
    buffers.a_mat.Read(queue, args.a_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return (args.layout == Layout::kRowMajor) ?
           id1*args.a_ld + id2 + args.a_offset:
           id2*args.a_ld + id1 + args.a_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return 2 * args.m * args.n; }
  static size_t GetBytes(const Arguments<T> &args) {
    return (2*args.m*args.n + args.m + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XGER_H_
#endif
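
// =================================================================================================
// Editor's note: a minimal standalone sketch (not CLBlast code; the Layout enum below is a local
// stand-in for clblast::Layout) of the row-major/column-major index mapping that
// TestXger::GetResultIndex uses above: element (i, j) lives at i*ld + j + offset in row-major
// storage and at j*ld + i + offset in column-major storage.

#include <cstddef>

enum class Layout { kRowMajor, kColMajor };

std::size_t matrix_index(Layout layout, std::size_t i, std::size_t j,
                         std::size_t ld, std::size_t offset) {
  return (layout == Layout::kRowMajor) ? i * ld + j + offset
                                       : j * ld + i + offset;
}

int main() {
  // Element (2, 1) of a 4x3 column-major matrix with ld = 4 and offset 0 sits at 1*4 + 2 = 6.
  return matrix_index(Layout::kColMajor, 2, 1, 4, 0) == 6 ? 0 : 1;
}
// =================================================================================================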
CLBlast-1.6.3/test/routines/level2/xgerc.hpp000066400000000000000000000167721463263031500206640ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xgerc routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XGERC_H_
#define CLBLAST_TEST_ROUTINES_XGERC_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXgerc {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgM, kArgN, kArgLayout,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufMatA}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.m * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    return args.n * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    auto a_rotated = (args.layout == Layout::kRowMajor);
    auto a_two = (a_rotated) ? args.m : args.n;
    return a_two * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Gerc(args.layout, args.m, args.n, args.alpha,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Gerc(args.layout, args.m, args.n, args.alpha,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXgerc(convertToCLBLAS(args.layout),
                                args.m, args.n, args.alpha,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXgerc(convertToCBLAS(args.layout),
                 args.m, args.n, args.alpha,
                 buffers_host.x_vec, args.x_offset, args.x_inc,
                 buffers_host.y_vec, args.y_offset, args.y_inc,
                 buffers_host.a_mat, args.a_offset, args.a_ld);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXgerc(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                args.m, args.n, args.alpha,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                buffers.a_mat, args.a_offset, args.a_ld);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.a_size, static_cast<T>(0));
    buffers.a_mat.Read(queue, args.a_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return (args.layout == Layout::kRowMajor) ?
           id1*args.a_ld + id2 + args.a_offset:
           id2*args.a_ld + id1 + args.a_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return 2 * args.m * args.n; }
  static size_t GetBytes(const Arguments<T> &args) {
    return (2*args.m*args.n + args.m + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XGERC_H_
#endif
CLBlast-1.6.3/test/routines/level2/xgeru.hpp000066400000000000000000000167721463263031500207060ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xgeru routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XGERU_H_
#define CLBLAST_TEST_ROUTINES_XGERU_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXgeru {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgM, kArgN, kArgLayout,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufMatA}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.m * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    return args.n * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    auto a_rotated = (args.layout == Layout::kRowMajor);
    auto a_two = (a_rotated) ? args.m : args.n;
    return a_two * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Geru(args.layout, args.m, args.n, args.alpha,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Geru(args.layout, args.m, args.n, args.alpha,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXgeru(convertToCLBLAS(args.layout),
                                args.m, args.n, args.alpha,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXgeru(convertToCBLAS(args.layout),
                 args.m, args.n, args.alpha,
                 buffers_host.x_vec, args.x_offset, args.x_inc,
                 buffers_host.y_vec, args.y_offset, args.y_inc,
                 buffers_host.a_mat, args.a_offset, args.a_ld);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXgeru(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                args.m, args.n, args.alpha,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                buffers.a_mat, args.a_offset, args.a_ld);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.a_size, static_cast<T>(0));
    buffers.a_mat.Read(queue, args.a_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return (args.layout == Layout::kRowMajor) ?
           id1*args.a_ld + id2 + args.a_offset:
           id2*args.a_ld + id1 + args.a_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return 2 * args.m * args.n; }
  static size_t GetBytes(const Arguments<T> &args) {
    return (2*args.m*args.n + args.m + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XGERU_H_
#endif
CLBlast-1.6.3/test/routines/level2/xhbmv.hpp000066400000000000000000000171561463263031500206750ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xhbmv routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XHBMV_H_
#define CLBLAST_TEST_ROUTINES_XHBMV_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXhbmv {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgKL, kArgLayout, kArgTriangle,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha, kArgBeta};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufVecY}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    return args.n * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    return args.n * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Hbmv(args.layout, args.triangle,
                         args.n, args.kl, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Hbmv(args.layout, args.triangle,
                         args.n, args.kl, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXhbmv(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.kl, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXhbmv(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 args.n, args.kl, args.alpha,
                 buffers_host.a_mat, args.a_offset, args.a_ld,
                 buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
                 buffers_host.y_vec, args.y_offset, args.y_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXhbmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                convertToCUBLAS(args.triangle),
                                args.n, args.kl, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.y_size, static_cast<T>(0));
    buffers.y_vec.Read(queue, args.y_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1*args.y_inc + args.y_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return 2 * args.n * args.n; }
  static size_t GetBytes(const Arguments<T> &args) {
    return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XHBMV_H_
#endif
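
// =================================================================================================
// Editor's note: a tiny standalone sketch (not CLBlast code) of the (kl + kl + 1) factor in
// TestXhbmv::GetBytes above. HBMV takes a single band-width argument, and Hermitian symmetry
// implies the same number of super-diagonals as sub-diagonals, so 2*kl + 1 diagonals are stored
// per column.

#include <cstddef>

std::size_t hermitian_band_elements(std::size_t n, std::size_t kl) {
  return (kl + kl + 1) * n;  // matches the band term in GetBytes above
}

int main() { return hermitian_band_elements(100, 3) == 700 ? 0 : 1; }
// =================================================================================================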
CLBlast-1.6.3/test/routines/level2/xhemv.hpp000066400000000000000000000170541463263031500206730ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xhemv routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XHEMV_H_
#define CLBLAST_TEST_ROUTINES_XHEMV_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXhemv {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgLayout, kArgTriangle,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha, kArgBeta};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufVecY}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    return args.n * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    return args.n * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Hemv(args.layout, args.triangle,
                         args.n, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Hemv(args.layout, args.triangle,
                         args.n, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXhemv(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXhemv(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 args.n, args.alpha,
                 buffers_host.a_mat, args.a_offset, args.a_ld,
                 buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
                 buffers_host.y_vec, args.y_offset, args.y_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXhemv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                convertToCUBLAS(args.triangle),
                                args.n, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.y_size, static_cast<T>(0));
    buffers.y_vec.Read(queue, args.y_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1*args.y_inc + args.y_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return 2 * args.n * args.n; }
  static size_t GetBytes(const Arguments<T> &args) {
    return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XHEMV_H_
#endif
CLBlast-1.6.3/test/routines/level2/xher.hpp000066400000000000000000000156661463263031500205150ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xher routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XHER_H_
#define CLBLAST_TEST_ROUTINES_XHER_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T, typename U>
class TestXher {
 public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgLayout, kArgTriangle,
            kArgALeadDim, kArgXInc,
            kArgAOffset, kArgXOffset,
            kArgAlpha};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; }
  static std::vector<std::string> BuffersOut() { return {kBufMatA}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<U> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeA(const Arguments<U> &args) {
    return args.n * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<U> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<U> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<U> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<U> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<U>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Her(args.layout, args.triangle,
                        args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Her(args.layout, args.triangle,
                        args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXher(convertToCLBLAS(args.layout),
                               convertToCLBLAS(args.triangle),
                               args.n, args.alpha,
                               buffers.x_vec, args.x_offset, args.x_inc,
                               buffers.a_mat, args.a_offset, args.a_ld,
                               1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) {
      cblasXher(convertToCBLAS(args.layout),
                convertToCBLAS(args.triangle),
                args.n, args.alpha,
                buffers_host.x_vec, args.x_offset, args.x_inc,
                buffers_host.a_mat, args.a_offset, args.a_ld);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<U> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXher(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                               convertToCUBLAS(args.triangle),
                               args.n, args.alpha,
                               buffers.x_vec, args.x_offset, args.x_inc,
                               buffers.a_mat, args.a_offset, args.a_ld);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.a_size, static_cast<T>(0));
    buffers.a_mat.Read(queue, args.a_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<U> &args) { return args.n; }
  static size_t ResultID2(const Arguments<U> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t id2) {
    return id2*args.a_ld + id1 + args.a_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<U> &args) { return 3 * args.n * args.n; }
  static size_t GetBytes(const Arguments<U> &args) {
    return (args.n*args.n + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XHER_H_
#endif
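
// =================================================================================================
// Editor's note: a tiny self-contained check (not CLBlast code; values are made up) of the HER
// operation this tester drives: A := alpha * x * x^H + A with a real alpha, which keeps A
// Hermitian. Shown here as a full update of a 2x2 column-major matrix, whereas the library only
// touches the selected triangle.

#include <complex>
#include <cstddef>
#include <vector>

int main() {
  using C = std::complex<float>;
  const std::size_t n = 2, ld = 2;
  const float alpha = 0.5f;                         // real-valued, as required by HER
  std::vector<C> a(ld * n, C(0.0f, 0.0f));          // column-major A, starting from zero
  std::vector<C> x = {C(1.0f, 2.0f), C(3.0f, -1.0f)};
  for (std::size_t j = 0; j < n; ++j) {
    for (std::size_t i = 0; i < n; ++i) {
      a[j * ld + i] += alpha * x[i] * std::conj(x[j]);
    }
  }
  // The diagonal of a Hermitian update stays real: alpha * |x[i]|^2.
  return (a[0].imag() == 0.0f && a[3].imag() == 0.0f) ? 0 : 1;
}
// =================================================================================================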
    cblasXher(convertToCBLAS(args.layout),
              convertToCBLAS(args.triangle),
              args.n, args.alpha,
              buffers_host.x_vec, args.x_offset, args.x_inc,
              buffers_host.a_mat, args.a_offset, args.a_ld);
    return StatusCode::kSuccess;
  }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
  static StatusCode RunReference3(const Arguments<U> &args, BuffersCUDA<T> &buffers, Queue &) {
    auto status = cublasXher(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                             convertToCUBLAS(args.triangle),
                             args.n, args.alpha,
                             buffers.x_vec, args.x_offset, args.x_inc,
                             buffers.a_mat, args.a_offset, args.a_ld);
    if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; }
    else { return StatusCode::kUnknownError; }
  }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.a_size, static_cast<T>(0));
    buffers.a_mat.Read(queue, args.a_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<U> &args) { return args.n; }
  static size_t ResultID2(const Arguments<U> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t id2) {
    return id2*args.a_ld + id1 + args.a_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<U> &args) { return 3 * args.n * args.n; }
  static size_t GetBytes(const Arguments<U> &args) {
    return (args.n*args.n + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XHER_H_
#endif

CLBlast-1.6.3/test/routines/level2/xher2.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xher2 routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
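//
// As background: Xher2 performs the Hermitian rank-2 update A := alpha*x*y^H + conj(alpha)*y*x^H + A,
// which is why this descriptor lists both an x and a y input vector while A is the only output buffer.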
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XHER2_H_ #define CLBLAST_TEST_ROUTINES_XHER2_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXher2 { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgALeadDim, kArgXInc, kArgYInc, kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeA(const Arguments &args) { return args.n * args.a_ld + args.a_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Her2(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Her2(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, buffers.a_mat(), args.a_offset, args.a_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXher2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); 
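    // Block on the event so that the clBLAS reference result is complete before its status is
    // converted and returned.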
clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXher2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.y_vec, args.y_offset, args.y_inc, buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXher2(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, buffers.a_mat, args.a_offset, args.a_ld); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.a_size, static_cast(0)); buffers.a_mat.Read(queue, args.a_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return id2*args.a_ld + id1 + args.a_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 5 * args.n * args.n; } static size_t GetBytes(const Arguments &args) { return (args.n*args.n + 2 * args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHER2_H_ #endif CLBlast-1.6.3/test/routines/level2/xhpmv.hpp000066400000000000000000000170311463263031500207050ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xhpmv routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
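//
// As background: packed routines such as Xhpmv store only one triangle of the n-by-n Hermitian
// matrix, hence the n*(n+1)/2 term in GetSizeAP below. As a sketch of the conventional
// column-major, upper-triangle packed layout (illustration only, not code used by this tester),
// the element (i, j) with i <= j would live at:
//
//   const auto packed_index = i + (j * (j + 1)) / 2;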
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XHPMV_H_ #define CLBLAST_TEST_ROUTINES_XHPMV_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXhpmv { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgXInc, kArgYInc, kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeAP(const Arguments &args) { return ((args.n*(args.n+1)) / 2) + args.ap_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.ap_size = GetSizeAP(args); args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpmv(args.layout, args.triangle, args.n, args.alpha, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Hpmv(args.layout, args.triangle, args.n, args.alpha, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.ap_mat, args.ap_offset, buffers.x_vec, args.x_offset, args.x_inc, args.beta, buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 
0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXhpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, buffers_host.ap_mat, args.ap_offset, buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXhpmv(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), args.n, args.alpha, buffers.ap_mat, args.ap_offset, buffers.x_vec, args.x_offset, args.x_inc, args.beta, buffers.y_vec, args.y_offset, args.y_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); buffers.y_vec.Read(queue, args.y_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1*args.y_inc + args.y_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n * args.n; } static size_t GetBytes(const Arguments &args) { return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHPMV_H_ #endif CLBlast-1.6.3/test/routines/level2/xhpr.hpp000066400000000000000000000156621463263031500205340ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xhpr routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
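//
// As a worked example of GetSizeAP below (hypothetical values): for n = 4 and ap_offset = 2 the
// packed buffer holds (4*5)/2 + 2 = 12 elements, roughly half of a full 4-by-4 matrix buffer.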
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XHPR_H_ #define CLBLAST_TEST_ROUTINES_XHPR_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXhpr { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgXInc, kArgAPOffset, kArgXOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufMatAP, kBufVecX}; } static std::vector BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeAP(const Arguments &args) { return ((args.n*(args.n+1)) / 2) + args.ap_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.ap_size = GetSizeAP(args); args.x_size = GetSizeX(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Hpr(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.ap_mat(), args.ap_offset, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { cblasXhpr(convertToCBLAS(args.layout), 
convertToCBLAS(args.triangle), args.n, args.alpha, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXhpr(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.ap_mat, args.ap_offset); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.ap_size, static_cast(0)); buffers.ap_mat.Read(queue, args.ap_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return GetSizeAP(args) - args.ap_offset; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1 + args.ap_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 3 * ((args.n*(args.n+1)) / 2); } static size_t GetBytes(const Arguments &args) { return ((args.n*(args.n+1)) + args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHPR_H_ #endif CLBlast-1.6.3/test/routines/level2/xhpr2.hpp000066400000000000000000000167241463263031500206160ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xhpr2 routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
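//
// As background: Xhpr2 applies the rank-2 update AP := alpha*x*y^H + conj(alpha)*y*x^H + AP to a
// packed Hermitian matrix, so the result-index helpers below walk the entire packed triangle.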
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XHPR2_H_ #define CLBLAST_TEST_ROUTINES_XHPR2_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXhpr2 { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgXInc, kArgYInc, kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeAP(const Arguments &args) { return ((args.n*(args.n+1)) / 2) + args.ap_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.ap_size = GetSizeAP(args); args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr2(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, buffers.ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Hpr2(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, buffers.ap_mat(), args.ap_offset, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return 
static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXhpr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.y_vec, args.y_offset, args.y_inc, buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXhpr2(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, buffers.ap_mat, args.ap_offset); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.ap_size, static_cast(0)); buffers.ap_mat.Read(queue, args.ap_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return GetSizeAP(args) - args.ap_offset; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1 + args.ap_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 5 * ((args.n*(args.n+1)) / 2); } static size_t GetBytes(const Arguments &args) { return ((args.n*(args.n+1)) + 2 * args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHPR2_H_ #endif CLBlast-1.6.3/test/routines/level2/xsbmv.hpp000066400000000000000000000171561463263031500207120ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xsbmv routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
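//
// As background: Xsbmv multiplies with a symmetric band matrix that has kl sub- and
// super-diagonals; GetBytes below therefore counts (kl+kl+1)*n logically-touched matrix elements
// plus the x and y vectors.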
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XSBMV_H_ #define CLBLAST_TEST_ROUTINES_XSBMV_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXsbmv { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgKL, kArgLayout, kArgTriangle, kArgALeadDim, kArgXInc, kArgYInc, kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeA(const Arguments &args) { return args.n * args.a_ld + args.a_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Sbmv(args.layout, args.triangle, args.n, args.kl, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Sbmv(args.layout, args.triangle, args.n, args.kl, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.x_vec, args.x_offset, args.x_inc, args.beta, 
buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXsbmv(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), args.n, args.kl, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.x_vec, args.x_offset, args.x_inc, args.beta, buffers.y_vec, args.y_offset, args.y_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); buffers.y_vec.Read(queue, args.y_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1*args.y_inc + args.y_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n * args.n; } static size_t GetBytes(const Arguments &args) { return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XSBMV_H_ #endif CLBlast-1.6.3/test/routines/level2/xspmv.hpp000066400000000000000000000170311463263031500207200ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xspmv routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
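//
// Note on the metrics below: Xspmv performs roughly 2*n^2 flops while reading only n*(n+1)/2
// matrix elements plus the vectors, so, like the other matrix-vector routines here, it is
// strongly memory-bound; GetFlops and GetBytes reflect this.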
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XSPMV_H_ #define CLBLAST_TEST_ROUTINES_XSPMV_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXspmv { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgXInc, kArgYInc, kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeAP(const Arguments &args) { return ((args.n*(args.n+1)) / 2) + args.ap_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.ap_size = GetSizeAP(args); args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Spmv(args.layout, args.triangle, args.n, args.alpha, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Spmv(args.layout, args.triangle, args.n, args.alpha, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.ap_mat, args.ap_offset, buffers.x_vec, args.x_offset, args.x_inc, args.beta, buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 
0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXspmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, buffers_host.ap_mat, args.ap_offset, buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXspmv(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), args.n, args.alpha, buffers.ap_mat, args.ap_offset, buffers.x_vec, args.x_offset, args.x_inc, args.beta, buffers.y_vec, args.y_offset, args.y_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); buffers.y_vec.Read(queue, args.y_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1*args.y_inc + args.y_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n * args.n; } static size_t GetBytes(const Arguments &args) { return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XSPMV_H_ #endif CLBlast-1.6.3/test/routines/level2/xspr.hpp000066400000000000000000000156471463263031500205520ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xspr routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
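//
// Note on result checking: for packed output matrices the helpers below iterate over the entire
// packed triangle, i.e. ResultID1 equals GetSizeAP(args) - args.ap_offset and GetResultIndex
// simply re-applies the offset.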
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XSPR_H_ #define CLBLAST_TEST_ROUTINES_XSPR_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXspr { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgXInc, kArgAPOffset, kArgXOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufMatAP, kBufVecX}; } static std::vector BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeAP(const Arguments &args) { return ((args.n*(args.n+1)) / 2) + args.ap_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.ap_size = GetSizeAP(args); args.x_size = GetSizeX(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Spr(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.ap_mat(), args.ap_offset, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXspr(convertToCBLAS(args.layout), 
convertToCBLAS(args.triangle), args.n, args.alpha, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXspr(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.ap_mat, args.ap_offset); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.ap_size, static_cast(0)); buffers.ap_mat.Read(queue, args.ap_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return GetSizeAP(args) - args.ap_offset; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1 + args.ap_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 3 * ((args.n*(args.n+1)) / 2); } static size_t GetBytes(const Arguments &args) { return ((args.n*(args.n+1)) + args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XSPR_H_ #endif CLBlast-1.6.3/test/routines/level2/xspr2.hpp000066400000000000000000000167241463263031500206310ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xspr2 routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
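//
// As background: Xspr2 applies the symmetric rank-2 update AP := alpha*x*y^T + alpha*y*x^T + AP,
// the real-valued counterpart of Xhpr2, which is reflected in the 5*(n*(n+1)/2) flop count below.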
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XSPR2_H_ #define CLBLAST_TEST_ROUTINES_XSPR2_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXspr2 { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgXInc, kArgYInc, kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeAP(const Arguments &args) { return ((args.n*(args.n+1)) / 2) + args.ap_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.ap_size = GetSizeAP(args); args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr2(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, buffers.ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Spr2(args.layout, args.triangle, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, buffers.ap_mat(), args.ap_offset, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return 
static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXspr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, buffers_host.x_vec, args.x_offset, args.x_inc, buffers_host.y_vec, args.y_offset, args.y_inc, buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXspr2(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), args.n, args.alpha, buffers.x_vec, args.x_offset, args.x_inc, buffers.y_vec, args.y_offset, args.y_inc, buffers.ap_mat, args.ap_offset); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.ap_size, static_cast(0)); buffers.ap_mat.Read(queue, args.ap_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return GetSizeAP(args) - args.ap_offset; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1 + args.ap_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 5 * ((args.n*(args.n+1)) / 2); } static size_t GetBytes(const Arguments &args) { return ((args.n*(args.n+1)) + 2 * args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XSPR2_H_ #endif CLBlast-1.6.3/test/routines/level2/xsymv.hpp000066400000000000000000000170541463263031500207360ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xsymv routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
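//
// As background: Xsymv computes y := alpha*A*x + beta*y with a symmetric n-by-n matrix A, so only
// the y vector appears in BuffersOut below and the flop count is the usual 2*n^2 of a
// matrix-vector product.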
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XSYMV_H_
#define CLBLAST_TEST_ROUTINES_XSYMV_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXsymv {
public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN,
            kArgLayout, kArgTriangle,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha, kArgBeta};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufVecY}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    return args.n * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    return args.n * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Symv(args.layout, args.triangle,
                         args.n, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Symv(args.layout, args.triangle,
                         args.n, args.alpha,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXsymv(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXsymv(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 args.n, args.alpha,
                 buffers_host.a_mat, args.a_offset, args.a_ld,
                 buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
                 buffers_host.y_vec, args.y_offset, args.y_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXsymv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                convertToCUBLAS(args.triangle),
                                args.n, args.alpha,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
                                buffers.y_vec, args.y_offset, args.y_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; }
      else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.y_size, static_cast<T>(0));
    buffers.y_vec.Read(queue, args.y_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1*args.y_inc + args.y_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) {
    return 2 * args.n * args.n;
  }
  static size_t GetBytes(const Arguments<T> &args) {
    return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XSYMV_H_
#endif

CLBlast-1.6.3/test/routines/level2/xsyr.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xsyr routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
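//
// A minimal worked example for the size and index helpers in the class below, assuming the
// illustrative values n = 4, a_ld = 4 and a_offset = 0 (these numbers are not taken from the
// test itself). Sizes are element counts of T, not bytes:
//
//   GetSizeA(args)             == 4 * 4 + 0   == 16 elements
//   GetResultIndex(args, 1, 2) == 2*4 + 1 + 0 == 9 (row id1 = 1, column id2 = 2 of A)
//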
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XSYR_H_
#define CLBLAST_TEST_ROUTINES_XSYR_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXsyr {
public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN,
            kArgLayout, kArgTriangle,
            kArgALeadDim, kArgXInc,
            kArgAOffset, kArgXOffset,
            kArgAlpha};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; }
  static std::vector<std::string> BuffersOut() { return {kBufMatA}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    return args.n * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Syr(args.layout, args.triangle,
                        args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Syr(args.layout, args.triangle,
                        args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXsyr(convertToCLBLAS(args.layout),
                               convertToCLBLAS(args.triangle),
                               args.n, args.alpha,
                               buffers.x_vec, args.x_offset, args.x_inc,
                               buffers.a_mat, args.a_offset, args.a_ld,
                               1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXsyr(convertToCBLAS(args.layout),
                convertToCBLAS(args.triangle),
                args.n, args.alpha,
                buffers_host.x_vec, args.x_offset, args.x_inc,
                buffers_host.a_mat, args.a_offset, args.a_ld);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXsyr(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                               convertToCUBLAS(args.triangle),
                               args.n, args.alpha,
                               buffers.x_vec, args.x_offset, args.x_inc,
                               buffers.a_mat, args.a_offset, args.a_ld);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; }
      else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.a_size, static_cast<T>(0));
    buffers.a_mat.Read(queue, args.a_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return id2*args.a_ld + id1 + args.a_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) {
    return 3 * args.n * args.n;
  }
  static size_t GetBytes(const Arguments<T> &args) {
    return (args.n*args.n + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XSYR_H_
#endif

CLBlast-1.6.3/test/routines/level2/xsyr2.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xsyr2 routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
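//
// For reference, a caller-side sketch of the public OpenCL API that RunRoutine() below exercises.
// The buffer, scalar and queue names here are hypothetical placeholders, not part of this test:
//
//   cl_event event = nullptr;
//   auto status = clblast::Syr2(clblast::Layout::kColMajor, clblast::Triangle::kUpper,
//                               n, alpha,
//                               x_buf, 0, 1,   // x: buffer, offset, increment
//                               y_buf, 0, 1,   // y: buffer, offset, increment
//                               a_buf, 0, n,   // A: buffer, offset, leading dimension
//                               &queue, &event);
//   if (status == clblast::StatusCode::kSuccess) { clWaitForEvents(1, &event); }
//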
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XSYR2_H_
#define CLBLAST_TEST_ROUTINES_XSYR2_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXsyr2 {
public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN,
            kArgLayout, kArgTriangle,
            kArgALeadDim, kArgXInc, kArgYInc,
            kArgAOffset, kArgXOffset, kArgYOffset,
            kArgAlpha};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
  static std::vector<std::string> BuffersOut() { return {kBufMatA}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeY(const Arguments<T> &args) {
    return args.n * args.y_inc + args.y_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    return args.n * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
    args.y_size = GetSizeY(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Syr2(args.layout, args.triangle,
                         args.n, args.alpha,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Syr2(args.layout, args.triangle,
                         args.n, args.alpha,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         buffers.y_vec(), args.y_offset, args.y_inc,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXsyr2(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXsyr2(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 args.n, args.alpha,
                 buffers_host.x_vec, args.x_offset, args.x_inc,
                 buffers_host.y_vec, args.y_offset, args.y_inc,
                 buffers_host.a_mat, args.a_offset, args.a_ld);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXsyr2(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                convertToCUBLAS(args.triangle),
                                args.n, args.alpha,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                buffers.y_vec, args.y_offset, args.y_inc,
                                buffers.a_mat, args.a_offset, args.a_ld);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; }
      else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.a_size, static_cast<T>(0));
    buffers.a_mat.Read(queue, args.a_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return id2*args.a_ld + id1 + args.a_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) {
    return 5 * args.n * args.n;
  }
  static size_t GetBytes(const Arguments<T> &args) {
    return (args.n*args.n + 2 * args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XSYR2_H_
#endif

CLBlast-1.6.3/test/routines/level2/xtbmv.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xtbmv routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
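//
// A worked example for the GetBytes() formula in the class below, assuming the illustrative
// values n = 1000 and kl = 10 (a band of kl sub- and kl super-diagonals plus the main diagonal):
//
//   band elements: (10 + 10 + 1) * 1000          == 21000
//   vector terms:  2*1000 + 1000                 ==  3000
//   GetBytes(args) == (21000 + 3000) * sizeof(T) == 24000 * sizeof(T) bytes
//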
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_ #define CLBLAST_TEST_ROUTINES_XTBMV_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXtbmv { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgKL, kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, kArgALeadDim, kArgXInc, kArgAOffset, kArgXOffset}; } static std::vector BuffersIn() { return {kBufMatA, kBufVecX}; } static std::vector BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeA(const Arguments &args) { return args.n * args.a_ld + args.a_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.x_size = GetSizeX(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Tbmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, args.kl, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Tbmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, args.kl, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, args.kl, buffers.a_mat, args.a_offset, args.a_ld, buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef 
CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXtbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, args.kl, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXtbmv(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), convertToCUBLAS(args.a_transpose), convertToCUBLAS(args.diagonal), args.n, args.kl, buffers.a_mat, args.a_offset, args.a_ld, buffers.x_vec, args.x_offset, args.x_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.x_size, static_cast(0)); buffers.x_vec.Read(queue, args.x_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1*args.x_inc + args.x_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n * args.n; } static size_t GetBytes(const Arguments &args) { return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XTBMV_H_ #endif CLBlast-1.6.3/test/routines/level2/xtpmv.hpp000066400000000000000000000165241463263031500207270ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xtpmv routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
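//
// A worked example for the packed-storage helper GetSizeAP() in the class below: a packed
// triangular matrix stores only the n*(n+1)/2 elements of one triangle. Assuming the
// illustrative values n = 100 and ap_offset = 8 (not values used by the test itself):
//
//   GetSizeAP(args) == (100 * 101) / 2 + 8 == 5058 elements of T
//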
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_ #define CLBLAST_TEST_ROUTINES_XTPMV_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXtpmv { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, kArgXInc, kArgAPOffset, kArgXOffset}; } static std::vector BuffersIn() { return {kBufMatAP, kBufVecX}; } static std::vector BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeAP(const Arguments &args) { return ((args.n*(args.n+1)) / 2) + args.ap_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.ap_size = GetSizeAP(args); args.x_size = GetSizeX(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Tpmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Tpmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtpmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, buffers.ap_mat, args.ap_offset, buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, 
BuffersHost &buffers_host, Queue &) { cblasXtpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, buffers_host.ap_mat, args.ap_offset, buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXtpmv(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), convertToCUBLAS(args.a_transpose), convertToCUBLAS(args.diagonal), args.n, buffers.ap_mat, args.ap_offset, buffers.x_vec, args.x_offset, args.x_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.x_size, static_cast(0)); buffers.x_vec.Read(queue, args.x_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1*args.x_inc + args.x_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n * args.n; } static size_t GetBytes(const Arguments &args) { return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XTPMV_H_ #endif CLBlast-1.6.3/test/routines/level2/xtrmv.hpp000066400000000000000000000165661463263031500207370ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xtrmv routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
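//
// The RunRoutine() in the class below uses the OpenCL completion pattern shared by these tests:
// wait on (and release) the event only if the call itself succeeded. A caller-side sketch, with
// hypothetical names and the routine argument list elided:
//
//   cl_event event = nullptr;
//   auto status = clblast::Trmv<float>(/* layout, triangle, transpose, ..., */ &queue_plain, &event);
//   if (status == clblast::StatusCode::kSuccess) {
//     clWaitForEvents(1, &event);  // block until the device kernel has finished
//     clReleaseEvent(event);       // release the event to avoid a leak
//   }
//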
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_ #define CLBLAST_TEST_ROUTINES_XTRMV_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXtrmv { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 2; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, kArgALeadDim, kArgXInc, kArgAOffset, kArgXOffset}; } static std::vector BuffersIn() { return {kBufMatA, kBufVecX}; } static std::vector BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeA(const Arguments &args) { return args.n * args.a_ld + args.a_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.x_size = GetSizeX(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.n; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Trmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, buffers.a_mat, args.a_offset, args.a_ld, buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode 
RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXtrmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXtrmv(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), convertToCUBLAS(args.a_transpose), convertToCUBLAS(args.diagonal), args.n, buffers.a_mat, args.a_offset, args.a_ld, buffers.x_vec, args.x_offset, args.x_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.x_size, static_cast(0)); buffers.x_vec.Read(queue, args.x_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { return id1*args.x_inc + args.x_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n * args.n; } static size_t GetBytes(const Arguments &args) { return (args.n*args.n + 2*args.n + args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XTRMV_H_ #endif CLBlast-1.6.3/test/routines/level2/xtrsv.hpp000066400000000000000000000201301463263031500207230ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xtrsv routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
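//
// Note on PrepareData() in the class below: unlike most level-2 tests, TRSV needs
// well-conditioned input, so the generator makes A diagonally dominant: every column is halved
// and each diagonal entry is then overwritten with the absolute value of its original entry
// plus n/4, while x is scaled to match. The idea in scalar form (indices illustrative):
//
//   a[i][j] /= 2;                       // for all entries of column i
//   a[i][i]  = fabs(a_orig[i][i]) + n/4;  // dominant diagonal => stable triangular solve
//   x[i]    /= 2;
//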
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XTRSV_H_
#define CLBLAST_TEST_ROUTINES_XTRSV_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXtrsv {
public:

  // The BLAS level: 1, 2, or 3
  static size_t BLASLevel() { return 2; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN,
            kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
            kArgALeadDim, kArgXInc,
            kArgAOffset, kArgXOffset};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; }
  static std::vector<std::string> BuffersOut() { return {kBufVecX}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeX(const Arguments<T> &args) {
    return args.n * args.x_inc + args.x_offset;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    return args.n * args.a_ld + args.a_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.x_size = GetSizeX(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &all) { return all; }
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T> &args, Queue&, const int,
                          std::vector<T> &x_source, std::vector<T>&, std::vector<T> &a_source,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&) {
    if (args.a_ld < args.n) { return; }
    if (args.a_size <= 0 || args.x_size <= 0) { return; }

    // Generates 'proper' input for the TRSV routine
    // TODO: Improve this, currently loosely based on clBLAS's implementation
    for (auto i = size_t{0}; i < args.n; ++i) {
      auto diagonal = a_source[i*args.a_ld + i + args.a_offset];
      diagonal = static_cast<T>(AbsoluteValue(diagonal)) +
                 Constant<T>(static_cast<double>(args.n / size_t{4}));
      for (auto j = size_t{0}; j < args.n; ++j) {
        a_source[j*args.a_ld + i + args.a_offset] /= Constant<T>(2.0);
      }
      a_source[i*args.a_ld + i + args.a_offset] = diagonal;
      x_source[i * args.x_inc + args.x_offset] /= Constant<T>(2.0);
    }
  }

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Trsv(args.layout, args.triangle, args.a_transpose, args.diagonal,
                         args.n,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Trsv(args.layout, args.triangle, args.a_transpose, args.diagonal,
                         args.n,
                         buffers.a_mat(), args.a_offset, args.a_ld,
                         buffers.x_vec(), args.x_offset, args.x_inc,
                         queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = clblasXtrsv(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                convertToCLBLAS(args.a_transpose),
                                convertToCLBLAS(args.diagonal),
                                args.n,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      return static_cast<StatusCode>(status);
    }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
    static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
      cblasXtrsv(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 convertToCBLAS(args.a_transpose),
                 convertToCBLAS(args.diagonal),
                 args.n,
                 buffers_host.a_mat, args.a_offset, args.a_ld,
                 buffers_host.x_vec, args.x_offset, args.x_inc);
      return StatusCode::kSuccess;
    }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
    static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
      auto status = cublasXtrsv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                convertToCUBLAS(args.triangle),
                                convertToCUBLAS(args.a_transpose),
                                convertToCUBLAS(args.diagonal),
                                args.n,
                                buffers.a_mat, args.a_offset, args.a_ld,
                                buffers.x_vec, args.x_offset, args.x_inc);
      if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; }
      else { return StatusCode::kUnknownError; }
    }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.x_size, static_cast<T>(0));
    buffers.x_vec.Read(queue, args.x_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1*args.x_inc + args.x_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) {
    return 2 * args.n * args.n;
  }
  static size_t GetBytes(const Arguments<T> &args) {
    return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XTRSV_H_
#endif

CLBlast-1.6.3/test/routines/level3/
CLBlast-1.6.3/test/routines/level3/xgemm.hpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xgemm routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
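//
// The class below carries an extra template parameter V selecting the GEMM kernel variant under
// test (0: library default, 1: force the 'indirect' kernel, 2: force the 'direct' kernel). It
// does so through the public OverrideParameters() API; a caller-side sketch with an illustrative
// device handle and threshold (problems smaller than XGEMM_MIN_INDIRECT_SIZE take the direct
// kernel):
//
//   auto status = clblast::OverrideParameters(device(), "GemmRoutine",
//                                             clblast::PrecisionValue<float>(),
//                                             {{"XGEMM_MIN_INDIRECT_SIZE", size_t{4096}}});
//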
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_ #define CLBLAST_TEST_ROUTINES_XGEMM_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template // 'V' is the version of the kernel (0 for default, 1 for 'in-direct', 2 for 'direct') class TestXgemm { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgM, kArgN, kArgK, kArgLayout, kArgATransp, kArgBTransp, kArgALeadDim, kArgBLeadDim, kArgCLeadDim, kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC, kBufMatAP}; } // used as temp buffer static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto a_two = (a_rotated) ? args.m : args.k; return a_two * args.a_ld + args.a_offset; } static size_t GetSizeB(const Arguments &args) { auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); auto b_two = (b_rotated) ? args.k : args.n; return b_two * args.b_ld + args.b_offset; } static size_t GetSizeC(const Arguments &args) { auto c_rotated = (args.layout == Layout::kRowMajor); auto c_two = (c_rotated) ? args.m : args.n; return c_two * args.c_ld + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue &queue) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); args.c_size = GetSizeC(args); // Optionally (V != 0) enforces indirect (V == 1) or direct (V == 2) kernels if (V != 0) { const auto device = queue.GetDevice(); const auto switch_threshold = (V == 1) ? 
size_t{0} : size_t{4096}; // large enough for tests const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue(), {{"XGEMM_MIN_INDIRECT_SIZE", switch_threshold}}); if (override_status != StatusCode::kSuccess) { } } // Sets the size of the temporary buffer (optional argument to GEMM) auto temp_buffer_size = size_t{0}; #ifdef OPENCL_API auto queue_plain = queue(); GemmTempBufferSize(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.a_offset, args.a_ld, args.b_offset, args.b_ld, args.c_offset, args.c_ld, &queue_plain, temp_buffer_size); #elif CUDA_API GemmTempBufferSize(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.a_offset, args.a_ld, args.b_offset, args.b_ld, args.c_offset, args.c_ld, queue.GetDevice()(), temp_buffer_size); #endif args.ap_size = (temp_buffer_size + sizeof(T)) / sizeof(T); // + sizeof(T) to prevent zero } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.k; } static size_t DefaultLDB(const Arguments &args) { return args.n; } static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &all) { return all; } // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event, buffers.ap_mat()); // temp buffer if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, queue.GetContext()(), queue.GetDevice()(), buffers.ap_mat()); // temp buffer cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgemm(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXgemm(convertToCBLAS(args.layout), 
convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXgemm(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.a_transpose), convertToCUBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.m; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return (args.layout == Layout::kRowMajor) ? id1*args.c_ld + id2 + args.c_offset: id2*args.c_ld + id1 + args.c_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) { // complex flops return args.m * args.n * (8 * args.k - 2); } else { // scalar flops return args.m * args.n * (2 * args.k - 1); } } static size_t GetBytes(const Arguments &args) { return (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XGEMM_H_ #endif CLBlast-1.6.3/test/routines/level3/xhemm.hpp000066400000000000000000000204501463263031500206610ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xhemm routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
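//
// A worked example for the side-dependent sizing in the class below: HEMM multiplies by the
// hermitian matrix A from the left (A is m-by-m) or from the right (A is n-by-n). Assuming the
// illustrative values m = 64, n = 128, column-major layout and a_offset = 0:
//
//   Side::kLeft : k_value = m = 64  =>  GetSizeA(args) ==  64 * args.a_ld + 0
//   Side::kRight: k_value = n = 128 =>  GetSizeA(args) == 128 * args.a_ld + 0
//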
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XHEMM_H_ #define CLBLAST_TEST_ROUTINES_XHEMM_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXhemm { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgM, kArgN, kArgLayout, kArgSide, kArgTriangle, kArgALeadDim, kArgBLeadDim, kArgCLeadDim, kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; auto a_rotated = (args.layout == Layout::kRowMajor); auto a_two = (a_rotated) ? args.m : k_value; return a_two * args.a_ld + args.a_offset; } static size_t GetSizeB(const Arguments &args) { size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; auto b_rotated = (args.layout == Layout::kRowMajor); auto b_two = (b_rotated) ? k_value : args.n; return b_two * args.b_ld + args.b_offset; } static size_t GetSizeC(const Arguments &args) { auto c_rotated = (args.layout == Layout::kRowMajor); auto c_two = (c_rotated) ? args.m : args.n; return c_two * args.c_ld + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); args.c_size = GetSizeC(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.m; } static size_t DefaultLDB(const Arguments &args) { return args.n; } static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemm(args.layout, args.side, args.triangle, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Hemm(args.layout, args.side, args.triangle, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for 
correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhemm(convertToCLBLAS(args.layout), convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXhemm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), args.m, args.n, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXhemm(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.side), convertToCUBLAS(args.triangle), args.m, args.n, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.m; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return (args.layout == Layout::kRowMajor) ? id1*args.c_ld + id2 + args.c_offset: id2*args.c_ld + id1 + args.c_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.m * args.n * args.m; } static size_t GetBytes(const Arguments &args) { return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHEMM_H_ #endif CLBlast-1.6.3/test/routines/level3/xher2k.hpp000066400000000000000000000207371463263031500207560ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xher2k routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. 
These // static methods are used by the correctness tester and the performance tester. // // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XHER2K_H_ #define CLBLAST_TEST_ROUTINES_XHER2K_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXher2k { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgK, kArgLayout, kArgTriangle, kArgATransp, kArgALeadDim, kArgBLeadDim, kArgCLeadDim, kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto a_two = (a_rotated) ? args.n : args.k; return a_two * args.a_ld + args.a_offset; } static size_t GetSizeB(const Arguments &args) { auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto b_two = (b_rotated) ? args.n : args.k; return b_two * args.b_ld + args.b_offset; } static size_t GetSizeC(const Arguments &args) { return args.n * args.c_ld + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); args.c_size = GetSizeC(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.k; } static size_t DefaultLDB(const Arguments &args) { return args.k; } static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {Transpose::kNo, Transpose::kConjugate}; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto alpha2 = T{args.alpha, args.alpha}; #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Her2k(args.layout, args.triangle, args.a_transpose, args.n, args.k, alpha2, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Her2k(args.layout, args.triangle, args.a_transpose, args.n, args.k, alpha2, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, 
args.c_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; auto status = clblasXher2k(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, alpha2, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { auto alpha2 = T{args.alpha, args.alpha}; cblasXher2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, alpha2, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto alpha2 = T{args.alpha, args.alpha}; auto status = cublasXher2k(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), convertToCUBLAS(args.a_transpose), args.n, args.k, alpha2, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return id1*args.c_ld + id2 + args.c_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n * args.n * args.k; } static size_t GetBytes(const Arguments &args) { return (args.n*args.k + args.n*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHER2K_H_ #endif CLBlast-1.6.3/test/routines/level3/xherk.hpp000066400000000000000000000170461463263031500206730ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
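//
// For reference: HERK computes the Hermitian rank-k update C := alpha*A*A^H + beta*C (or
// C := alpha*A^H*A + beta*C for the conjugate-transposed case), where C is an n-by-n Hermitian
// matrix and alpha and beta are real-valued scalars.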
// // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xherk routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. // // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XHERK_H_ #define CLBLAST_TEST_ROUTINES_XHERK_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXherk { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgK, kArgLayout, kArgTriangle, kArgATransp, kArgALeadDim, kArgCLeadDim, kArgAOffset, kArgCOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto a_two = (a_rotated) ? args.n : args.k; return a_two * args.a_ld + args.a_offset; } static size_t GetSizeC(const Arguments &args) { return args.n * args.c_ld + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.c_size = GetSizeC(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.k; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {Transpose::kNo, Transpose::kConjugate}; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Herk(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Herk(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const 
Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXherk(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { cblasXherk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, args.beta, buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXherk(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), convertToCUBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return id1*args.c_ld + id2 + args.c_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return args.n * args.n * args.k; } static size_t GetBytes(const Arguments &args) { return (args.n*args.k + args.n*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHERK_H_ #endif CLBlast-1.6.3/test/routines/level3/xsymm.hpp000066400000000000000000000207771463263031500207340ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xsymm routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
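//
// For reference: SYMM computes C := alpha*A*B + beta*C when the symmetric matrix A is placed on
// the left (Side::kLeft), or C := alpha*B*A + beta*C when it is placed on the right
// (Side::kRight). A is m-by-m in the former case and n-by-n in the latter, which is why the
// buffer-size helpers below select between args.m and args.n based on args.side.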
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_ #define CLBLAST_TEST_ROUTINES_XSYMM_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXsymm { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgM, kArgN, kArgLayout, kArgSide, kArgTriangle, kArgALeadDim, kArgBLeadDim, kArgCLeadDim, kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; auto a_rotated = (args.layout == Layout::kRowMajor); auto a_two = (a_rotated) ? args.m : k_value; return a_two * args.a_ld + args.a_offset; } static size_t GetSizeB(const Arguments &args) { size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; auto b_rotated = (args.layout == Layout::kRowMajor); auto b_two = (b_rotated) ? k_value : args.n; return b_two * args.b_ld + args.b_offset; } static size_t GetSizeC(const Arguments &args) { auto c_rotated = (args.layout == Layout::kRowMajor); auto c_two = (c_rotated) ? args.m : args.n; return c_two * args.c_ld + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); args.c_size = GetSizeC(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.m; } static size_t DefaultLDB(const Arguments &args) { return args.n; } static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Symm(args.layout, args.side, args.triangle, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Symm(args.layout, args.side, args.triangle, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for 
correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsymm(convertToCLBLAS(args.layout), convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsymm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), args.m, args.n, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXsymm(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.side), convertToCUBLAS(args.triangle), args.m, args.n, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.m; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return (args.layout == Layout::kRowMajor) ? id1*args.c_ld + id2 + args.c_offset: id2*args.c_ld + id1 + args.c_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) { // complex flops return 8 * args.m * args.n * args.m; } else { // scalar flops return 2 * args.m * args.n * args.m; } } static size_t GetBytes(const Arguments &args) { return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XSYMM_H_ #endif CLBlast-1.6.3/test/routines/level3/xsyr2k.hpp000066400000000000000000000204501463263031500210050ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. 
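//
// For reference: SYR2K computes the symmetric rank-2k update
// C := alpha*A*B^T + alpha*B*A^T + beta*C (or C := alpha*A^T*B + alpha*B^T*A + beta*C for the
// transposed case), where C is an n-by-n symmetric matrix.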
// // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xsyr2k routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. // // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_ #define CLBLAST_TEST_ROUTINES_XSYR2K_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXsyr2k { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgK, kArgLayout, kArgTriangle, kArgATransp, kArgALeadDim, kArgBLeadDim, kArgCLeadDim, kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto a_two = (a_rotated) ? args.n : args.k; return a_two * args.a_ld + args.a_offset; } static size_t GetSizeB(const Arguments &args) { auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto b_two = (b_rotated) ? 
args.n : args.k; return b_two * args.b_ld + args.b_offset; } static size_t GetSizeC(const Arguments &args) { return args.n * args.c_ld + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); args.c_size = GetSizeC(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.k; } static size_t DefaultLDB(const Arguments &args) { return args.k; } static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {Transpose::kNo, Transpose::kYes}; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2k(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Syr2k(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyr2k(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsyr2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXsyr2k(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), 
convertToCUBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return id1*args.c_ld + id2 + args.c_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 2 * args.n * args.n * args.k; } static size_t GetBytes(const Arguments &args) { return (args.n*args.k + args.n*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XSYR2K_H_ #endif CLBlast-1.6.3/test/routines/level3/xsyrk.hpp000066400000000000000000000173531463263031500207330ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xsyrk routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. // // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_ #define CLBLAST_TEST_ROUTINES_XSYRK_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXsyrk { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgK, kArgLayout, kArgTriangle, kArgATransp, kArgALeadDim, kArgCLeadDim, kArgAOffset, kArgCOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto a_two = (a_rotated) ? 
args.n : args.k; return a_two * args.a_ld + args.a_offset; } static size_t GetSizeC(const Arguments &args) { return args.n * args.c_ld + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.c_size = GetSizeC(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.k; } static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {Transpose::kNo, Transpose::kYes}; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Syrk(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Syrk(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyrk(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXsyrk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, args.beta, buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXsyrk(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.triangle), convertToCUBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, args.beta, buffers.c_mat, args.c_offset, args.c_ld); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } 
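// Anything other than CUBLAS_STATUS_SUCCESS is mapped onto CLBlast's generic unknown-error code.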
else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return id1*args.c_ld + id2 + args.c_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) { // complex flops return 4 * args.n * args.n * args.k; } else { // scalar flops return args.n * args.n * args.k; } } static size_t GetBytes(const Arguments &args) { return (args.n*args.k + args.n*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XSYRK_H_ #endif CLBlast-1.6.3/test/routines/level3/xtrmm.hpp000066400000000000000000000202421463263031500207110ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xtrmm routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. // // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XTRMM_H_ #define CLBLAST_TEST_ROUTINES_XTRMM_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXtrmm { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgM, kArgN, kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal, kArgALeadDim, kArgBLeadDim, kArgAOffset, kArgBOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } static std::vector BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { auto k = (args.side == Side::kLeft) ? args.m : args.n; return k * args.a_ld + args.a_offset; } static size_t GetSizeB(const Arguments &args) { auto b_rotated = (args.layout == Layout::kRowMajor); auto b_two = (b_rotated) ? 
args.m : args.n; return b_two * args.b_ld + args.b_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.m; } static size_t DefaultLDB(const Arguments &args) { return args.n; } static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrmm(convertToCLBLAS(args.layout), convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXtrmm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.m, args.n, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.b_mat, args.b_offset, args.b_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXtrmm(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.side), convertToCUBLAS(args.triangle), convertToCUBLAS(args.a_transpose), convertToCUBLAS(args.diagonal), args.m, args.n, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld); if (status == 
CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.b_size, static_cast(0)); buffers.b_mat.Read(queue, args.b_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.m; } static size_t ResultID2(const Arguments &args) { return args.n; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return (args.layout == Layout::kRowMajor) ? id1*args.b_ld + id2 + args.b_offset: id2*args.b_ld + id1 + args.b_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { auto k = (args.side == Side::kLeft) ? args.m : args.n; if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) { // complex flops return 4 * args.m * args.n * k; } else { // scalar flops return args.m * args.n * k; } } static size_t GetBytes(const Arguments &args) { auto k = (args.side == Side::kLeft) ? args.m : args.n; return (k*k + 2*args.m*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XTRMM_H_ #endif CLBlast-1.6.3/test/routines/level3/xtrsm.hpp000066400000000000000000000213251463263031500207220ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xtrsm routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. // // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XTRSM_H_ #define CLBLAST_TEST_ROUTINES_XTRSM_H_ #include "test/routines/common.hpp" #include "test/routines/level3/xtrsm_data.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXtrsm { public: // The BLAS level: 1, 2, or 3 static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgM, kArgN, kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal, kArgALeadDim, kArgBLeadDim, kArgAOffset, kArgBOffset, kArgAlpha}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } static std::vector BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { const auto k = (args.side == Side::kLeft) ? args.m : args.n; return k * args.a_ld + args.a_offset; } static size_t GetSizeB(const Arguments &args) { const auto b_rotated = (args.layout == Layout::kRowMajor); const auto b_two = (b_rotated) ? 
args.m : args.n; return b_two * args.b_ld + args.b_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.m; } static size_t DefaultLDB(const Arguments &args) { return args.n; } static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments &args, Queue&, const int seed, std::vector&, std::vector&, std::vector& a_source_, std::vector& b_source_, std::vector&, std::vector&, std::vector&) { const auto k = (args.side == Side::kLeft) ? args.m : args.n; const auto b_one = (args.layout == Layout::kRowMajor) ? args.n : args.m; if (args.a_ld < k) { return; } if (args.b_ld < b_one) { return; } if (args.a_size <= 0 || args.b_size <= 0) { return; } // TODO: This is a copy of the clBLAS random matrix generation, make it work properly GenerateProperTrsmMatrices(args, seed, &a_source_[args.a_offset], &b_source_[args.b_offset]); } // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrsm(convertToCLBLAS(args.layout), convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, buffers.a_mat, args.a_offset, args.a_ld, buffers.b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXtrsm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.m, args.n, args.alpha, buffers_host.a_mat, args.a_offset, args.a_ld, buffers_host.b_mat, args.b_offset, args.b_ld); return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) 
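// (As with the clBLAS and CPU BLAS references above, this back-end is only compiled in when the
// corresponding CLBLAST_REF_CUBLAS define is set at build time.)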
#ifdef CLBLAST_REF_CUBLAS
  static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
    auto status = cublasXtrsm(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                              convertToCUBLAS(args.side),
                              convertToCUBLAS(args.triangle),
                              convertToCUBLAS(args.a_transpose),
                              convertToCUBLAS(args.diagonal),
                              args.m, args.n, args.alpha,
                              buffers.a_mat, args.a_offset, args.a_ld,
                              buffers.b_mat, args.b_offset, args.b_ld);
    if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; }
    else { return StatusCode::kUnknownError; }
  }
  #endif

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.b_size, static_cast<T>(0));
    buffers.b_mat.Read(queue, args.b_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return (args.layout == Layout::kRowMajor) ?
           id1*args.b_ld + id2 + args.b_offset:
           id2*args.b_ld + id1 + args.b_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) {
    auto k = (args.side == Side::kLeft) ? args.m : args.n;
    if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) { // complex flops
      return 4 * args.m * args.n * k;
    } else { // scalar flops
      return args.m * args.n * k;
    }
  }
  static size_t GetBytes(const Arguments<T> &args) {
    auto k = (args.side == Side::kLeft) ? args.m : args.n;
    return (k*k + 2*args.m*args.n) * sizeof(T);
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XTRSM_H_
#endif
CLBlast-1.6.3/test/routines/level3/xtrsm_data.hpp000066400000000000000000000163331463263031500217160ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements data-preparation routines for proper input for the TRSM routine.
Note: The // data-preparation routines are taken from clBLAS // // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XTRSM_DATA_H_ #define CLBLAST_TEST_ROUTINES_XTRSM_DATA_H_ #include #include #include #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= // Limits to prepare proper input data template double TrsmLimitMatA(); template <> double TrsmLimitMatA() { return pow(2.0, 7); } template <> double TrsmLimitMatA() { return pow(2.0, 5); } template <> double TrsmLimitMatA() { return TrsmLimitMatA(); } template <> double TrsmLimitMatA() { return TrsmLimitMatA(); } template double TrsmLimitMatB(); template <> double TrsmLimitMatB() { return pow(2.0, 16); } template <> double TrsmLimitMatB() { return pow(2.0, 47); } template <> double TrsmLimitMatB() { return TrsmLimitMatB(); } template <> double TrsmLimitMatB() { return TrsmLimitMatB(); } // Matrix element setter template void SetElement(const clblast::Layout layout, const size_t row, const size_t column, T *mat, const size_t ld, const T value) { if (layout == clblast::Layout::kRowMajor) { mat[column + ld * row] = value; } else { mat[row + ld * column] = value; } } // Matrix element getter template T GetElement(const clblast::Layout layout, const size_t row, const size_t column, const T *mat, const size_t ld) { if (layout == clblast::Layout::kRowMajor) { return mat[column + ld * row]; } else { return mat[row + ld * column]; } } // Bounds a value between 'left' and 'right'. The random value is assumed to be between -1 and +1. template T BoundRandom(const double rand_val, const double left, const double right) { const auto value = Constant(rand_val * (right - left)); if (AbsoluteValue(value) < 0.0) { return value - Constant(left); } else { return value + Constant(left); } } // The clBLAS function to generate proper input matrices for matrices A & B. Note that this routine // should remain deterministic. Random values are therefore taken from the existing input, which // is scaled between -1 and +1. template void GenerateProperTrsmMatrices(const Arguments &args, const int seed, T *mat_a, T *mat_b) { // Random number generator std::mt19937 mt(seed); std::uniform_real_distribution dist(-1.0, 1.0); const auto k = (args.side == Side::kLeft) ? args.m : args.n; // Determines: max(|a_{ii}|) and min(|a_{ii}|) // Generates: a_{ii} which are constrainted by min/max auto min = ConstantZero(); if (args.diagonal == clblast::Diagonal::kUnit) { for (auto i = size_t{0}; i < k; ++i) { SetElement(args.layout, i, i, mat_a, args.a_ld, ConstantOne()); // must not be accessed } } else { auto max = Constant(dist(mt) * TrsmLimitMatA()); if (AbsoluteValue(max) < 1.0) { max += Constant(3.0); } // no zero's on the diagonal min = max / Constant(100.0); SetElement(args.layout, 0, 0, mat_a, args.a_ld, max); for (auto i = size_t{1}; i < k; ++i) { auto value = BoundRandom(dist(mt), AbsoluteValue(min), AbsoluteValue(max)); if (AbsoluteValue(value) == 0) { value = max; } SetElement(args.layout, i, i, mat_a, args.a_ld, value); } } // Generates a_{ij} for all j <> i. for (auto i = size_t{0}; i < k; ++i) { auto sum = (args.diagonal == clblast::Diagonal::kUnit) ? 
AbsoluteValue(ConstantOne()) : AbsoluteValue(GetElement(args.layout, i, i, mat_a, args.a_ld)); for (auto j = size_t{0}; j < k; ++j) { if (j == i) { continue; } auto value = ConstantZero(); if (((args.triangle == clblast::Triangle::kUpper) && (j > i)) || ((args.triangle == clblast::Triangle::kLower) && (j < i))) { if (sum >= 1.0) { const auto limit = sum / std::sqrt(static_cast(k) - static_cast(j)); value = Constant(dist(mt) * limit); sum -= AbsoluteValue(value); } } SetElement(args.layout, i, j, mat_a, args.a_ld, value); } } // Generate matrix B if (args.side == clblast::Side::kLeft) { for (auto j = size_t{0}; j < args.n; ++j) { auto sum = TrsmLimitMatB(); for (auto i = size_t{0}; i < args.m; ++i) { const auto a_value = GetElement(args.layout, i, i, mat_a, args.a_ld); auto value = ConstantZero(); if (sum >= 0.0) { const auto limit = sum * AbsoluteValue(a_value) / std::sqrt(static_cast(args.m) - static_cast(i)); value = Constant(dist(mt) * limit); sum -= AbsoluteValue(value) / AbsoluteValue(a_value); } SetElement(args.layout, i, j, mat_b, args.b_ld, value); if ((i == 0 && j == 0) || (AbsoluteValue(value) < AbsoluteValue(min))) { min = value; } } } } else { for (auto i = size_t{0}; i < args.m; ++i) { auto sum = TrsmLimitMatB(); for (auto j = size_t{0}; j < args.n; ++j) { const auto a_value = GetElement(args.layout, j, j, mat_a, args.a_ld); auto value = ConstantZero(); if (sum >= 0.0) { const auto limit = sum * AbsoluteValue(a_value) / std::sqrt(static_cast(args.n) - static_cast(j)); value = Constant(dist(mt) * limit); sum -= AbsoluteValue(value) / AbsoluteValue(a_value); } SetElement(args.layout, i, j, mat_b, args.b_ld, value); if ((i == 0 && j == 0) || (AbsoluteValue(value) < AbsoluteValue(min))) { min = value; } } } } if (args.diagonal == clblast::Diagonal::kUnit) { for (auto i = size_t{0}; i < k; ++i) { SetElement(args.layout, i, i, mat_a, args.a_ld, ConstantOne()); // must not be accessed } } // Calculate a proper alpha if (AbsoluteValue(min) > AbsoluteValue(args.alpha)) { // Not implemented } // Adjust matrix B according to the value of alpha if (AbsoluteValue(args.alpha) != 1.0 && AbsoluteValue(args.alpha) != 0.0) { for (auto i = size_t{0}; i < args.m; ++i) { for (auto j = size_t{0}; j < args.n; ++j) { auto value = GetElement(args.layout, i, j, mat_b, args.b_ld); value /= args.alpha; SetElement(args.layout, i, j, mat_b, args.b_ld, value); } } } } // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XTRSM_DATA_H_ #endif CLBlast-1.6.3/test/routines/levelx/000077500000000000000000000000001463263031500171365ustar00rootroot00000000000000CLBlast-1.6.3/test/routines/levelx/xaxpybatched.hpp000066400000000000000000000177611463263031500223470ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the XaxpyBatched routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
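//
// For reference: AxpyBatched performs batch_count independent AXPY operations,
// y_i := alpha_i * x_i + y_i. Each batch has its own alpha and its own offset into the shared
// x and y buffers; SetSizes below lays the batches out back-to-back and derives the per-batch
// offsets and alphas accordingly.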
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ #define CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXaxpyBatched { public: // Although it is a non-BLAS routine, it can still be tested against level-1 routines in a loop static size_t BLASLevel() { return 1; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgYInc, kArgBatchCount, kArgAlpha}; } static std::vector BuffersIn() { return {kBufVecX, kBufVecY}; } static std::vector BuffersOut() { return {kBufVecY}; } // Helper for the sizes per batch static size_t PerBatchSizeX(const Arguments &args) { return args.n * args.x_inc; } static size_t PerBatchSizeY(const Arguments &args) { return args.n * args.y_inc; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return PerBatchSizeX(args) * args.batch_count + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return PerBatchSizeY(args) * args.batch_count + args.y_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); // Also sets the batch-related variables args.x_offsets = std::vector(args.batch_count); args.y_offsets = std::vector(args.batch_count); args.alphas = std::vector(args.batch_count); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { args.x_offsets[batch] = batch * PerBatchSizeX(args) + args.x_offset; args.y_offsets[batch] = batch * PerBatchSizeY(args) + args.y_offset; args.alphas[batch] = args.alpha + Constant(static_cast(batch + 1)); } } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = AxpyBatched(args.n, args.alphas.data(), buffers.x_vec(), args.x_offsets.data(), args.x_inc, buffers.y_vec(), args.y_offsets.data(), args.y_inc, args.batch_count, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = AxpyBatched(args.n, args.alphas.data(), buffers.x_vec(), args.x_offsets.data(), args.x_inc, buffers.y_vec(), args.y_offsets.data(), args.y_inc, args.batch_count, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); 
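// Note: unlike the OpenCL path above, the CUDA API variant returns no event to wait on, so the
// whole stream is synchronized explicitly before the status is returned.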
#endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { auto event = cl_event{}; auto status = clblasXaxpy(args.n, args.alphas[batch], buffers.x_vec, args.x_offsets[batch], args.x_inc, buffers.y_vec, args.y_offsets[batch], args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); if (static_cast(status) != StatusCode::kSuccess) { return static_cast(status); } } return StatusCode::kSuccess; } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { cblasXaxpy(args.n, args.alphas[batch], buffers_host.x_vec, args.x_offsets[batch], args.x_inc, buffers_host.y_vec, args.y_offsets[batch], args.y_inc); } return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { auto status = cublasXaxpy(reinterpret_cast(args.cublas_handle), args.n, args.alphas[batch], buffers.x_vec, args.x_offsets[batch], args.x_inc, buffers.y_vec, args.y_offsets[batch], args.y_inc); if (status != CUBLAS_STATUS_SUCCESS) { return StatusCode::kUnknownError; } } return StatusCode::kSuccess; } #endif // Describes how to download the results of the computation static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); buffers.y_vec.Read(queue, args.y_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &args) { return args.batch_count; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return (id1 * args.y_inc) + args.y_offsets[id2]; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return args.batch_count * (2 * args.n); } static size_t GetBytes(const Arguments &args) { return args.batch_count * (3 * args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ #endif CLBlast-1.6.3/test/routines/levelx/xcol2im.hpp000066400000000000000000000245521463263031500212340ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xcol2im routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
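// =================================================================================================
// For intuition: the ColHeight/ColWidth helpers below implement the standard convolution
// output-size formula. A minimal stand-alone sketch of the same computation follows; the helper
// name is hypothetical and not part of CLBlast.

#include <cstddef>

// output = (size + 2*pad - (dilation*(kernel - 1) + 1)) / stride + 1, clamped to a minimum of 1
inline std::size_t ExampleConvOutputSize(const std::size_t size, const std::size_t kernel,
                                         const std::size_t pad, const std::size_t stride,
                                         const std::size_t dilation) {
  const auto padded = size + 2 * pad;               // image extent including padding
  const auto window = dilation * (kernel - 1) + 1;  // effective kernel footprint
  return (padded >= window) ? (padded - window) / stride + 1 : 1;
}
// Example: height 5, kernel 3, pad 1, stride 2, dilation 1 gives (5 + 2 - 3) / 2 + 1 = 3 rows.
// =================================================================================================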
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XCOL2IM_H_ #define CLBLAST_TEST_ROUTINES_XCOL2IM_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXcol2im { public: // The BLAS level: 4 for the extra routines static size_t BLASLevel() { return 4; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgKernelMode, kArgChannels, kArgHeight, kArgWidth, kArgKernelH, kArgKernelW, kArgPadH, kArgPadW, kArgStrideH, kArgStrideW, kArgDilationH, kArgDilationW, kArgAOffset, kArgBOffset}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } // b = col static std::vector BuffersOut() { return {kBufMatA}; } // a = im // Describes how to obtain the sizes of the buffers static size_t ColHeight(const Arguments &args) { const auto size = args.height + 2 * args.pad_h; const auto padding = args.dilation_h * (args.kernel_h - 1) + 1; if (size >= padding) { return (size - padding) / args.stride_h + 1; } return 1; } static size_t ColWidth(const Arguments &args) { const auto size = args.width + 2 * args.pad_w; const auto padding = args.dilation_w * (args.kernel_w - 1) + 1; if (size >= padding) { return (size - padding) / args.stride_w + 1; } return 1; } static size_t NumPatches(const Arguments &args) { return ColHeight(args) * ColWidth(args) * args.channels; } static size_t GetSizeA(const Arguments &args) { return args.height * args.width * args.channels + args.a_offset; } static size_t GetSizeB(const Arguments &args) { return args.kernel_w * args.kernel_h * NumPatches(args) + args.b_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); // im args.b_size = GetSizeB(args); // col } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Col2im(args.kernel_mode, args.channels, args.height, args.width, args.kernel_h, args.kernel_w, args.pad_h, args.pad_w, args.stride_h, args.stride_w, args.dilation_h, args.dilation_w, buffers.b_mat(), args.b_offset, // col buffers.a_mat(), args.a_offset, // im &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Col2im(args.kernel_mode, args.channels, args.height, args.width, args.kernel_h, args.kernel_w, 
args.pad_h, args.pad_w, args.stride_h, args.stride_w, args.dilation_h, args.dilation_w, buffers.b_mat(), args.b_offset, // col buffers.a_mat(), args.a_offset, // im queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto buffers_host = BuffersHost(); DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); const auto status = RunReference(args, buffers_host); HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); return status; } static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { return RunReference(args, buffers_host); } static StatusCode RunReference3(const Arguments &, BuffersCUDA &, Queue &) { return StatusCode::kUnknownError; } // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.a_size, static_cast(0)); buffers.a_mat.Read(queue, args.a_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.height * args.width; } static size_t ResultID2(const Arguments &args) { return args.channels; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return id1 + args.height * args.width * id2 + args.a_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &) { return 1; } static size_t GetBytes(const Arguments &args) { const auto im = args.channels * args.width * args.height; // possibly less with striding const auto col = args.kernel_h * args.kernel_w * NumPatches(args); return (im + col) * sizeof(T); } }; // ================================================================================================= template StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { // Reference taken from im2col but swapped the input/output const auto col_h = TestXcol2im::ColHeight(args); const auto col_w = TestXcol2im::ColWidth(args); for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) { // image channels for (auto kh_id = size_t{0}; kh_id < args.kernel_h; ++kh_id) { // kernel height for (auto kw_id = size_t{0}; kw_id < args.kernel_w; ++kw_id) { // kernel width for (auto h_id = size_t{0}; h_id < col_h; ++h_id) { // image height for (auto w_id = size_t{0}; w_id < col_w; ++w_id) { // image width // Reads the input value const auto kernel_index = (args.kernel_mode == KernelMode::kConvolution) ? 
args.kernel_h * args.kernel_w - kw_id - args.kernel_w * kh_id - 1 : kw_id + args.kernel_w * kh_id; const auto patch_index = w_id + col_w * h_id; const auto col_index = patch_index + kernel_index * col_w * col_h + c_id * col_w * col_h * args.kernel_h * args.kernel_w; const auto val = buffers_host.b_mat[col_index + args.b_offset]; // Sets the output value const auto h_index = kh_id * args.dilation_h + args.stride_h * h_id - args.pad_h; const auto w_index = kw_id * args.dilation_w + args.stride_w * w_id - args.pad_w; if (h_index >= 0 && h_index < args.height && w_index >= 0 && w_index < args.width) { const auto im_index = w_index + args.width * (h_index + args.height * c_id); buffers_host.a_mat[im_index + args.a_offset] += val; } } } } } } return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); auto dummy = std::vector(0); auto dummy_uint = std::vector(0); auto buffers2 = BuffersHost{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy, dummy_uint}; auto args2 = Arguments(); args2.a_size = args.a_size; args2.b_size = args.b_size; args2.kernel_mode = args.kernel_mode; args2.channels = args.channels; args2.height = args.height; args2.width = args.width; args2.kernel_h = args.kernel_h; args2.kernel_w = args.kernel_w; args2.pad_h = args.pad_h; args2.pad_w = args.pad_w; args2.stride_h = args.stride_h; args2.stride_w = args.stride_w; args2.dilation_h = args.dilation_h; args2.dilation_w = args.dilation_w; args2.a_offset = args.a_offset; args2.b_offset = args.b_offset; auto status = RunReference(args2, buffers2); FloatToHalfBuffer(buffers_host.a_mat, buffers2.a_mat); return status; } // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XCOL2IM_H_ #endif CLBlast-1.6.3/test/routines/levelx/xconvgemm.hpp000066400000000000000000000304701463263031500216560ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xconvgemm routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
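// =================================================================================================
// For intuition: the Xconvgemm test below counts FLOPS as if the convolution were one implicit
// GEMM per batch image: each of the output_h * output_w output positions is a dot product of
// length kernel_h * kernel_w * channels with each of the num_kernels filters, at 2 FLOPS
// (multiply + add) per element. A minimal sketch mirroring GetFlops(); the name is hypothetical.

#include <cstddef>

inline std::size_t ExampleConvGemmFlops(const std::size_t batch_count, const std::size_t num_kernels,
                                        const std::size_t output_h, const std::size_t output_w,
                                        const std::size_t kernel_h, const std::size_t kernel_w,
                                        const std::size_t channels) {
  const auto patch_size = kernel_h * kernel_w * channels;  // length of one im2col column
  const auto num_patches = output_h * output_w;            // columns of the implicit GEMM
  return batch_count * 2 * num_patches * num_kernels * patch_size;
}
// Example: 1 batch, 8 kernels of 3x3 over 16 channels, 32x32 output:
// 1 * 2 * (32*32) * 8 * (3*3*16) = 2359296 FLOPS.
// =================================================================================================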
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XCONVGEMM_H_ #define CLBLAST_TEST_ROUTINES_XCONVGEMM_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXconvgemm { public: // The BLAS level: 4 for the extra routines static size_t BLASLevel() { return 4; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgKernelMode, kArgChannels, kArgHeight, kArgWidth, kArgKernelH, kArgKernelW, kArgPadH, kArgPadW, kArgStrideH, kArgStrideW, kArgDilationH, kArgDilationW, kArgNumKernels, kArgBatchCount, kArgAOffset, kArgBOffset, kArgCOffset}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t OutputHeight(const Arguments &args) { const auto size = args.height + 2 * args.pad_h; const auto padding = args.dilation_h * (args.kernel_h - 1) + 1; if (size >= padding) { return (size - padding) / args.stride_h + 1; } return 1; } static size_t OutputWidth(const Arguments &args) { const auto size = args.width + 2 * args.pad_w; const auto padding = args.dilation_w * (args.kernel_w - 1) + 1; if (size >= padding) { return (size - padding) / args.stride_w + 1; } return 1; } static size_t NumPatches(const Arguments &args) { return OutputHeight(args) * OutputWidth(args) * args.channels; } static size_t GetSizeA(const Arguments &args) { // 4D: NCHW == batch-channel-height-width return args.batch_count * args.channels * args.height * args.width + args.a_offset; } static size_t GetSizeB(const Arguments &args) { // 4D: KCHW == kernel-channel-height-width return args.num_kernels * args.channels * args.kernel_h * args.kernel_w + args.b_offset; } static size_t GetSizeC(const Arguments &args) { // 4D: NCHW == batch-channel-height-width return args.batch_count * args.num_kernels * OutputHeight(args) * OutputWidth(args) + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); args.c_size = GetSizeC(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Convgemm(args.kernel_mode, args.channels, args.height, args.width, args.kernel_h, args.kernel_w, args.pad_h, 
args.pad_w, args.stride_h, args.stride_w, args.dilation_h, args.dilation_w, args.num_kernels, args.batch_count, buffers.a_mat(), args.a_offset, buffers.b_mat(), args.b_offset, buffers.c_mat(), args.c_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Convgemm(args.kernel_mode, args.channels, args.height, args.width, args.kernel_h, args.kernel_w, args.pad_h, args.pad_w, args.stride_h, args.stride_w, args.dilation_h, args.dilation_w, args.num_kernels, args.batch_count, buffers.a_mat(), args.a_offset, buffers.b_mat(), args.b_offset, buffers.c_mat(), args.c_offset, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto buffers_host = BuffersHost(); DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); const auto status = RunReference(args, buffers_host); HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); return status; } static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { return RunReference(args, buffers_host); } static StatusCode RunReference3(const Arguments &, BuffersCUDA &, Queue &) { return StatusCode::kUnknownError; } // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return OutputHeight(args) * OutputWidth(args); } static size_t ResultID2(const Arguments &args) { return args.num_kernels * args.batch_count; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return id1 + OutputHeight(args) * OutputWidth(args) * id2 + args.c_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { const auto patch_size = args.kernel_h * args.kernel_w * args.channels; const auto num_patches = OutputHeight(args) * OutputWidth(args); return args.batch_count * 2 * num_patches * args.num_kernels * patch_size; } static size_t GetBytes(const Arguments &args) { return (GetSizeA(args) + GetSizeB(args) + GetSizeC(args)) * sizeof(T); } }; // ================================================================================================= template StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { const auto output_h = TestXconvgemm::OutputHeight(args); const auto output_w = TestXconvgemm::OutputWidth(args); for (auto batch_id = size_t{0}; batch_id < args.batch_count; ++batch_id) { for (auto co_id = size_t{0}; co_id < args.num_kernels; ++co_id) { // output channels == num-kernels for (auto ho_id = size_t{0}; ho_id < output_h; ++ho_id) { // image height for (auto wo_id = size_t{0}; wo_id < output_w; ++wo_id) { // image width auto result = ConstantZero(); // 3D convolution for (auto ci_id = size_t{0}; ci_id < args.channels; ++ci_id) { // input channels for (auto kh_id = size_t{0}; kh_id < args.kernel_h; ++kh_id) { // kernel height for (auto kw_id = size_t{0}; kw_id < 
args.kernel_w; ++kw_id) { // kernel width // Retrieves the value from the input image const auto hi_id = kh_id * args.dilation_h + args.stride_h * ho_id - args.pad_h; const auto wi_id = kw_id * args.dilation_w + args.stride_w * wo_id - args.pad_w; if (hi_id >= 0 && hi_id < args.height && wi_id >= 0 && wi_id < args.width) { const auto input_index = wi_id + args.width * ( hi_id + args.height * ( ci_id + args.channels * ( batch_id))); const auto input_value = buffers_host.a_mat[input_index + args.a_offset]; // Multiplies with the kernel tensor const auto kernel_index = (args.kernel_mode == KernelMode::kConvolution) ? (args.kernel_w - kw_id - 1) + args.kernel_w * ( (args.kernel_h - kh_id - 1) + args.kernel_h * ( ci_id + args.channels * ( co_id))) : kw_id + args.kernel_w * ( kh_id + args.kernel_h * ( ci_id + args.channels * ( co_id))); const auto kernel_value = buffers_host.b_mat[kernel_index + args.b_offset]; result += input_value * kernel_value; } } } } // Sets the output value (NCHW == batch-channel-height-width) const auto output_index = wo_id + output_w * ( ho_id + output_h * ( co_id + args.num_kernels * ( batch_id))); buffers_host.c_mat[output_index + args.c_offset] = result; } } } } return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); auto c_buffer2 = HalfToFloatBuffer(buffers_host.c_mat); auto dummy = std::vector(0); auto dummy_uint = std::vector(0); auto buffers2 = BuffersHost{dummy, dummy, a_buffer2, b_buffer2, c_buffer2, dummy, dummy, dummy_uint}; auto args2 = Arguments(); args2.a_size = args.a_size; args2.b_size = args.b_size; args2.c_size = args.c_size; args2.kernel_mode = args.kernel_mode; args2.channels = args.channels; args2.height = args.height; args2.width = args.width; args2.kernel_h = args.kernel_h; args2.kernel_w = args.kernel_w; args2.pad_h = args.pad_h; args2.pad_w = args.pad_w; args2.stride_h = args.stride_h; args2.stride_w = args.stride_w; args2.dilation_h = args.dilation_h; args2.dilation_w = args.dilation_w; args2.num_kernels = args.num_kernels; args2.batch_count = args.batch_count; args2.a_offset = args.a_offset; args2.b_offset = args.b_offset; args2.c_offset = args.c_offset; auto status = RunReference(args2, buffers2); FloatToHalfBuffer(buffers_host.c_mat, buffers2.c_mat); return status; } // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XCONVGEMM_H_ #endif CLBlast-1.6.3/test/routines/levelx/xgemmbatched.hpp000066400000000000000000000271321463263031500223040ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the XgemmBatched routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
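// =================================================================================================
// For intuition: the batched GEMM test below packs all batches into a single buffer and derives a
// per-batch offset from a fixed per-batch size, giving every batch a slightly different alpha and
// beta so that batch-indexing mistakes show up as correctness errors. A minimal sketch of the
// offset scheme used by SetSizes(); the helper name is hypothetical and not part of CLBlast.

#include <cstddef>
#include <vector>

inline std::vector<std::size_t> ExampleBatchOffsets(const std::size_t batch_count,
                                                    const std::size_t per_batch_size,
                                                    const std::size_t base_offset) {
  auto offsets = std::vector<std::size_t>(batch_count);
  for (auto batch = std::size_t{0}; batch < batch_count; ++batch) {
    offsets[batch] = batch * per_batch_size + base_offset;  // batch b starts b matrices further in
  }
  return offsets;
}
// Example: 3 batches of 100 elements each with base offset 8 yields offsets {8, 108, 208}.
// =================================================================================================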
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XGEMMBATCHED_H_ #define CLBLAST_TEST_ROUTINES_XGEMMBATCHED_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXgemmBatched { public: // Although it is a non-BLAS routine, it can still be tested against level-3 routines in a loop static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgM, kArgN, kArgK, kArgLayout, kArgATransp, kArgBTransp, kArgALeadDim, kArgBLeadDim, kArgCLeadDim, kArgAOffset, kArgBOffset, kArgCOffset, kArgBatchCount, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Helper for the sizes per batch static size_t PerBatchSizeA(const Arguments &args) { auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto a_two = (a_rotated) ? args.m : args.k; return a_two * args.a_ld; } static size_t PerBatchSizeB(const Arguments &args) { auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); auto b_two = (b_rotated) ? args.k : args.n; return b_two * args.b_ld; } static size_t PerBatchSizeC(const Arguments &args) { auto c_rotated = (args.layout == Layout::kRowMajor); auto c_two = (c_rotated) ? args.m : args.n; return c_two * args.c_ld; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { return PerBatchSizeA(args) * args.batch_count + args.a_offset; } static size_t GetSizeB(const Arguments &args) { return PerBatchSizeB(args) * args.batch_count + args.b_offset; } static size_t GetSizeC(const Arguments &args) { return PerBatchSizeC(args) * args.batch_count + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); args.c_size = GetSizeC(args); // Also sets the batch-related variables args.a_offsets = std::vector(args.batch_count); args.b_offsets = std::vector(args.batch_count); args.c_offsets = std::vector(args.batch_count); args.alphas = std::vector(args.batch_count); args.betas = std::vector(args.batch_count); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { args.a_offsets[batch] = batch * PerBatchSizeA(args) + args.a_offset; args.b_offsets[batch] = batch * PerBatchSizeB(args) + args.b_offset; args.c_offsets[batch] = batch * PerBatchSizeC(args) + args.c_offset; args.alphas[batch] = args.alpha + Constant(static_cast(batch + 1)); args.betas[batch] = args.beta + Constant(static_cast(batch + 1)); } } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.k; } static size_t DefaultLDB(const Arguments &args) { return args.n; } static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const 
Transposes &all) { return all; } // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { // Relaxed requirement on ld_a and ld_b within the library, this is here to match clBLAS auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); auto a_one = (!a_rotated) ? args.m : args.k; auto b_one = (!b_rotated) ? args.k : args.n; if (args.a_ld < a_one) { return StatusCode::kInvalidLeadDimA; } if (args.b_ld < b_one) { return StatusCode::kInvalidLeadDimB; } #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.alphas.data(), buffers.a_mat(), args.a_offsets.data(), args.a_ld, buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), buffers.c_mat(), args.c_offsets.data(), args.c_ld, args.batch_count, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.alphas.data(), buffers.a_mat(), args.a_offsets.data(), args.a_ld, buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), buffers.c_mat(), args.c_offsets.data(), args.c_ld, args.batch_count, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { auto event = cl_event{}; auto status = clblasXgemm(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.b_transpose), args.m, args.n, args.k, args.alphas[batch], buffers.a_mat, args.a_offsets[batch], args.a_ld, buffers.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], buffers.c_mat, args.c_offsets[batch], args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); if (static_cast(status) != StatusCode::kSuccess) { return static_cast(status); } } return StatusCode::kSuccess; } #endif // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { cblasXgemm(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose), args.m, args.n, args.k, args.alphas[batch], buffers_host.a_mat, args.a_offsets[batch], args.a_ld, buffers_host.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], buffers_host.c_mat, args.c_offsets[batch], args.c_ld); } return StatusCode::kSuccess; } #endif // Describes how to run the cuBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CUBLAS static 
StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { auto status = cublasXgemm(reinterpret_cast(args.cublas_handle), args.layout, convertToCUBLAS(args.a_transpose), convertToCUBLAS(args.b_transpose), args.m, args.n, args.k, args.alphas[batch], buffers.a_mat, args.a_offsets[batch], args.a_ld, buffers.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], buffers.c_mat, args.c_offsets[batch], args.c_ld); if (status != CUBLAS_STATUS_SUCCESS) { return StatusCode::kUnknownError; } } return StatusCode::kSuccess; } #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.m; } static size_t ResultID2(const Arguments &args) { return args.n * args.batch_count; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2_3) { const size_t id2 = id2_3 % args.n; const size_t id3 = id2_3 / args.n; return (args.layout == Layout::kRowMajor) ? id1*args.c_ld + id2 + args.c_offsets[id3]: id2*args.c_ld + id1 + args.c_offsets[id3]; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) { // complex flops return args.batch_count * args.m * args.n * (8 * args.k - 2); } else { // scalar flops return args.batch_count * args.m * args.n * (2 * args.k - 1); } } static size_t GetBytes(const Arguments &args) { return args.batch_count * (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XGEMMBATCHED_H_ #endif CLBlast-1.6.3/test/routines/levelx/xgemmstridedbatched.hpp000066400000000000000000000257221463263031500236660ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the XgemmStridedBatched routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
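// =================================================================================================
// For intuition: strided-batched GEMM differs from the offset-array variant above in that every
// matrix is addressed as base_offset + batch_id * stride, with a single shared alpha and beta,
// instead of per-batch offset/alpha/beta arrays. The test below therefore reuses the per-batch
// sizes as the strides. A one-function sketch of the addressing; the name is hypothetical.

#include <cstddef>

inline std::size_t ExampleStridedBatchOffset(const std::size_t base_offset,
                                             const std::size_t stride,
                                             const std::size_t batch_id) {
  return base_offset + stride * batch_id;  // e.g. offset 16, stride 4096, batch 2 -> 8208
}
// =================================================================================================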
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XGEMMSTRIDEDBATCHED_H_ #define CLBLAST_TEST_ROUTINES_XGEMMSTRIDEDBATCHED_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXgemmStridedBatched { public: // Although it is a non-BLAS routine, it can still be tested against level-3 routines in a loop static size_t BLASLevel() { return 3; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgM, kArgN, kArgK, kArgLayout, kArgATransp, kArgBTransp, kArgALeadDim, kArgBLeadDim, kArgCLeadDim, kArgAOffset, kArgBOffset, kArgCOffset, kArgBatchCount, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Helper for the sizes per batch static size_t PerBatchSizeA(const Arguments &args) { auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); auto a_two = (a_rotated) ? args.m : args.k; return a_two * args.a_ld; } static size_t PerBatchSizeB(const Arguments &args) { auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); auto b_two = (b_rotated) ? args.k : args.n; return b_two * args.b_ld; } static size_t PerBatchSizeC(const Arguments &args) { auto c_rotated = (args.layout == Layout::kRowMajor); auto c_two = (c_rotated) ? args.m : args.n; return c_two * args.c_ld; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments &args) { return PerBatchSizeA(args) * args.batch_count + args.a_offset; } static size_t GetSizeB(const Arguments &args) { return PerBatchSizeB(args) * args.batch_count + args.b_offset; } static size_t GetSizeC(const Arguments &args) { return PerBatchSizeC(args) * args.batch_count + args.c_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); args.c_size = GetSizeC(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &args) { return args.k; } static size_t DefaultLDB(const Arguments &args) { return args.n; } static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &all) { return all; } static Transposes GetBTransposes(const Transposes &all) { return all; } // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = GemmStridedBatched(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, PerBatchSizeA(args), buffers.b_mat(), 
args.b_offset, args.b_ld, PerBatchSizeB(args), args.beta,
                                       buffers.c_mat(), args.c_offset, args.c_ld, PerBatchSizeC(args),
                                       args.batch_count,
                                       &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = GemmStridedBatched(args.layout, args.a_transpose, args.b_transpose,
                                       args.m, args.n, args.k, args.alpha,
                                       buffers.a_mat(), args.a_offset, args.a_ld, PerBatchSizeA(args),
                                       buffers.b_mat(), args.b_offset, args.b_ld, PerBatchSizeB(args),
                                       args.beta,
                                       buffers.c_mat(), args.c_offset, args.c_ld, PerBatchSizeC(args),
                                       args.batch_count,
                                       queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run the clBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CLBLAS
  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    auto queue_plain = queue();
    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
      const auto a_batch_offset = args.a_offset + PerBatchSizeA(args) * batch;
      const auto b_batch_offset = args.b_offset + PerBatchSizeB(args) * batch;
      const auto c_batch_offset = args.c_offset + PerBatchSizeC(args) * batch;
      auto event = cl_event{};
      auto status = clblasXgemm(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.a_transpose),
                                convertToCLBLAS(args.b_transpose),
                                args.m, args.n, args.k, args.alpha,
                                buffers.a_mat, a_batch_offset, args.a_ld,
                                buffers.b_mat, b_batch_offset, args.b_ld, args.beta,
                                buffers.c_mat, c_batch_offset, args.c_ld,
                                1, &queue_plain, 0, nullptr, &event);
      clWaitForEvents(1, &event);
      if (static_cast<StatusCode>(status) != StatusCode::kSuccess) {
        return static_cast<StatusCode>(status);
      }
    }
    return StatusCode::kSuccess;
  }
  #endif

  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CBLAS
  static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
      const auto a_batch_offset = args.a_offset + PerBatchSizeA(args) * batch;
      const auto b_batch_offset = args.b_offset + PerBatchSizeB(args) * batch;
      const auto c_batch_offset = args.c_offset + PerBatchSizeC(args) * batch;
      cblasXgemm(convertToCBLAS(args.layout),
                 convertToCBLAS(args.a_transpose),
                 convertToCBLAS(args.b_transpose),
                 args.m, args.n, args.k, args.alpha,
                 buffers_host.a_mat, a_batch_offset, args.a_ld,
                 buffers_host.b_mat, b_batch_offset, args.b_ld, args.beta,
                 buffers_host.c_mat, c_batch_offset, args.c_ld);
    }
    return StatusCode::kSuccess;
  }
  #endif

  // Describes how to run the cuBLAS routine (for correctness/performance comparison)
  #ifdef CLBLAST_REF_CUBLAS
  static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
      const auto a_batch_offset = args.a_offset + PerBatchSizeA(args) * batch;
      const auto b_batch_offset = args.b_offset + PerBatchSizeB(args) * batch;
      const auto c_batch_offset = args.c_offset + PerBatchSizeC(args) * batch;
      auto status = cublasXgemm(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout,
                                convertToCUBLAS(args.a_transpose),
                                convertToCUBLAS(args.b_transpose),
                                args.m, args.n, args.k, args.alpha,
                                buffers.a_mat, a_batch_offset, args.a_ld,
                                buffers.b_mat, b_batch_offset, args.b_ld, args.beta,
                                buffers.c_mat, c_batch_offset, args.c_ld);
      if (status != CUBLAS_STATUS_SUCCESS) { return StatusCode::kUnknownError; }
    }
    return StatusCode::kSuccess;
  }
  #endif

  // Describes how to download the results of the computation (more importantly:
which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.m; } static size_t ResultID2(const Arguments &args) { return args.n * args.batch_count; } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2_3) { const size_t id2 = id2_3 % args.n; const size_t id3 = id2_3 / args.n; const auto c_batch_offset = args.c_offset + PerBatchSizeC(args) * id3; return (args.layout == Layout::kRowMajor) ? id1*args.c_ld + id2 + c_batch_offset: id2*args.c_ld + id1 + c_batch_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { if((args.precision == Precision::kComplexSingle) || (args.precision == Precision::kComplexDouble)) { // complex flops return args.batch_count * args.m * args.n * (8 * args.k - 2); } else { // scalar flops return args.batch_count * args.m * args.n * (2 * args.k - 1); } } static size_t GetBytes(const Arguments &args) { return args.batch_count * (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XGEMMSTRIDEDBATCHED_H_ #endif CLBlast-1.6.3/test/routines/levelx/xhad.hpp000066400000000000000000000172031463263031500205760ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xhad routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
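// =================================================================================================
// For intuition: the Xhad routine tested below computes a scaled element-wise (Hadamard) product,
// z[i] = alpha * x[i] * y[i] + beta * z[i]. A minimal host-side sketch over contiguous vectors,
// assuming unit increments and zero offsets; the function name is hypothetical.

#include <cstddef>
#include <vector>

template <typename T>
void ExampleHadamard(const T alpha, const std::vector<T> &x, const std::vector<T> &y,
                     const T beta, std::vector<T> &z) {
  for (auto i = std::size_t{0}; i < z.size(); ++i) {
    z[i] = alpha * x[i] * y[i] + beta * z[i];  // 4 FLOPS per element, matching GetFlops() below
  }
}
// =================================================================================================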
// // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XHAD_H_ #define CLBLAST_TEST_ROUTINES_XHAD_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= template StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { for (auto index = size_t{0}; index < args.n; ++index) { const auto x = buffers_host.x_vec[index * args.x_inc + args.x_offset]; const auto y = buffers_host.y_vec[index * args.y_inc + args.y_offset]; const auto z = buffers_host.c_mat[index]; // * args.z_inc + args.z_offset]; buffers_host.c_mat[index] = args.alpha * x * y + args.beta * z; } return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { auto x_buffer2 = HalfToFloatBuffer(buffers_host.x_vec); auto y_buffer2 = HalfToFloatBuffer(buffers_host.y_vec); auto c_buffer2 = HalfToFloatBuffer(buffers_host.c_mat); auto dummy = std::vector(0); auto dummy_uint = std::vector(0); auto buffers2 = BuffersHost{x_buffer2, y_buffer2, dummy, dummy, c_buffer2, dummy, dummy, dummy_uint}; auto args2 = Arguments(); args2.x_size = args.x_size; args2.y_size = args.y_size; args2.c_size = args.c_size; args2.x_inc = args.x_inc; args2.y_inc = args.y_inc; args2.n = args.n; args2.x_offset = args.x_offset; args2.y_offset = args.y_offset; args2.alpha = HalfToFloat(args.alpha); args2.beta = HalfToFloat(args.beta); auto status = RunReference(args2, buffers2); FloatToHalfBuffer(buffers_host.c_mat, buffers2.c_mat); return status; } // ================================================================================================= // See comment at top of file for a description of the class template class TestXhad { public: // The BLAS level: 4 for the extra routines (note: tested with matrix-size values for 'n') static size_t BLASLevel() { return 4; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } static std::vector BuffersIn() { return {kBufVecX, kBufVecY, kBufMatC}; } static std::vector BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeY(const Arguments &args) { return args.n * args.y_inc + args.y_offset; } static size_t GetSizeC(const Arguments &args) { // used for 'vector z' return args.n; // * args.z_inc + args.z_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); args.c_size = GetSizeC(args); // used for 'vector z' } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine 
// Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Had(args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, args.beta, buffers.c_mat(), 0, 1, // used for 'vector z' &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Had(args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, args.beta, buffers.c_mat(), 0, 1, // used for 'vector z' queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto buffers_host = BuffersHost(); DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); const auto status = RunReference(args, buffers_host); HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); return status; } static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { return RunReference(args, buffers_host); } static StatusCode RunReference3(const Arguments &, BuffersCUDA &, Queue &) { return StatusCode::kUnknownError; } // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine static size_t GetResultIndex(const Arguments &, const size_t id1, const size_t) { return id1; // * args.z_inc + args.z_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { return 4 * args.n; } static size_t GetBytes(const Arguments &args) { return (4 * args.n) * sizeof(T); } }; // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHAD_H_ #endif CLBlast-1.6.3/test/routines/levelx/xim2col.hpp000066400000000000000000000244521463263031500212330ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xim2col routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. 
These // static methods are used by the correctness tester and the performance tester. // // ================================================================================================= #ifndef CLBLAST_TEST_ROUTINES_XIM2COL_H_ #define CLBLAST_TEST_ROUTINES_XIM2COL_H_ #include "test/routines/common.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template class TestXim2col { public: // The BLAS level: 4 for the extra routines static size_t BLASLevel() { return 4; } // The list of arguments relevant for this routine static std::vector GetOptions() { return {kArgKernelMode, kArgChannels, kArgHeight, kArgWidth, kArgKernelH, kArgKernelW, kArgPadH, kArgPadW, kArgStrideH, kArgStrideW, kArgDilationH, kArgDilationW, kArgAOffset, kArgBOffset}; } static std::vector BuffersIn() { return {kBufMatA, kBufMatB}; } static std::vector BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t ColHeight(const Arguments &args) { const auto size = args.height + 2 * args.pad_h; const auto padding = args.dilation_h * (args.kernel_h - 1) + 1; if (size >= padding) { return (size - padding) / args.stride_h + 1; } return 1; } static size_t ColWidth(const Arguments &args) { const auto size = args.width + 2 * args.pad_w; const auto padding = args.dilation_w * (args.kernel_w - 1) + 1; if (size >= padding) { return (size - padding) / args.stride_w + 1; } return 1; } static size_t NumPatches(const Arguments &args) { return ColHeight(args) * ColWidth(args) * args.channels; } static size_t GetSizeA(const Arguments &args) { return args.height * args.width * args.channels + args.a_offset; } static size_t GetSizeB(const Arguments &args) { return args.kernel_w * args.kernel_h * NumPatches(args) + args.b_offset; } // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args, Queue&) { args.a_size = GetSizeA(args); args.b_size = GetSizeB(args); } // Describes what the default values of the leading dimensions of the matrices are static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes which transpose options are relevant for this routine using Transposes = std::vector; static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to prepare the input data static void PrepareData(const Arguments&, Queue&, const int, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Im2col(args.kernel_mode, args.channels, args.height, args.width, args.kernel_h, args.kernel_w, args.pad_h, args.pad_w, args.stride_h, args.stride_w, args.dilation_h, args.dilation_w, buffers.a_mat(), args.a_offset, buffers.b_mat(), args.b_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Im2col(args.kernel_mode, args.channels, args.height, 
args.width, args.kernel_h, args.kernel_w, args.pad_h, args.pad_w, args.stride_h, args.stride_w, args.dilation_h, args.dilation_w, buffers.a_mat(), args.a_offset, buffers.b_mat(), args.b_offset, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; } // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto buffers_host = BuffersHost(); DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); const auto status = RunReference(args, buffers_host); HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); return status; } static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue&) { return RunReference(args, buffers_host); } static StatusCode RunReference3(const Arguments &, BuffersCUDA &, Queue &) { return StatusCode::kUnknownError; } // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.b_size, static_cast(0)); buffers.b_mat.Read(queue, args.b_size, result); return result; } // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.kernel_h * args.kernel_w; } static size_t ResultID2(const Arguments &args) { return NumPatches(args); } static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { return id1 + args.kernel_h * args.kernel_w * id2 + args.b_offset; } // Describes how to compute performance metrics static size_t GetFlops(const Arguments &) { return 1; } static size_t GetBytes(const Arguments &args) { const auto input = args.channels * args.width * args.height; // possibly less with striding const auto output = args.kernel_h * args.kernel_w * NumPatches(args); return (input + output) * sizeof(T); } }; // ================================================================================================= template StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { const auto col_h = TestXim2col::ColHeight(args); const auto col_w = TestXim2col::ColWidth(args); for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) { // input channels for (auto kh_id = size_t{0}; kh_id < args.kernel_h; ++kh_id) { // kernel height for (auto kw_id = size_t{0}; kw_id < args.kernel_w; ++kw_id) { // kernel width for (auto h_id = size_t{0}; h_id < col_h; ++h_id) { // image height for (auto w_id = size_t{0}; w_id < col_w; ++w_id) { // image width // Retrieves the input value const auto h_index = kh_id * args.dilation_h + args.stride_h * h_id - args.pad_h; const auto w_index = kw_id * args.dilation_w + args.stride_w * w_id - args.pad_w; auto val = ConstantZero(); if (h_index >= 0 && h_index < args.height && w_index >= 0 && w_index < args.width) { const auto im_index = w_index + args.width * (h_index + args.height * c_id); val = buffers_host.a_mat[im_index + args.a_offset]; } // Sets the output value const auto kernel_index = (args.kernel_mode == KernelMode::kConvolution) ? 
args.kernel_h * args.kernel_w - kw_id - args.kernel_w * kh_id - 1 : kw_id + args.kernel_w * kh_id; const auto patch_index = w_id + col_w * h_id; const auto col_index = patch_index + kernel_index * col_w * col_h + c_id * col_w * col_h * args.kernel_h * args.kernel_w; buffers_host.b_mat[col_index + args.b_offset] = val; } } } } } return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> StatusCode RunReference(const Arguments &args, BuffersHost &buffers_host) { auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); auto dummy = std::vector(0); auto dummy_uint = std::vector(0); auto buffers2 = BuffersHost{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy, dummy_uint}; auto args2 = Arguments(); args2.a_size = args.a_size; args2.b_size = args.b_size; args2.kernel_mode = args.kernel_mode; args2.channels = args.channels; args2.height = args.height; args2.width = args.width; args2.kernel_h = args.kernel_h; args2.kernel_w = args.kernel_w; args2.pad_h = args.pad_h; args2.pad_w = args.pad_w; args2.stride_h = args.stride_h; args2.stride_w = args.stride_w; args2.dilation_h = args.dilation_h; args2.dilation_w = args.dilation_w; args2.a_offset = args.a_offset; args2.b_offset = args.b_offset; auto status = RunReference(args2, buffers2); FloatToHalfBuffer(buffers_host.b_mat, buffers2.b_mat); return status; } // ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XIM2COL_H_ #endif CLBlast-1.6.3/test/routines/levelx/xinvert.hpp000066400000000000000000000242561463263031500213570ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a class with static methods to describe the Xinvert routine. Examples of // such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These // static methods are used by the correctness tester and the performance tester. 
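// =================================================================================================
// For intuition: Xinvert, tested below, inverts only the diagonal blocks of a triangular matrix,
// a building block for triangular solvers. For one lower-triangular block L, the reference below
// applies the classic substitution: X(i,i) = 1 / L(i,i), and for i > j
//   X(i,j) = -X(i,i) * sum over k = j..i-1 of L(i,k) * X(k,j).
// A compact dense sketch for a single column-major block; the name is hypothetical and no
// unit-diagonal or out-of-bounds handling is included.

#include <cstddef>
#include <vector>

inline std::vector<double> ExampleInvertLowerBlock(const std::vector<double> &l, const std::size_t n) {
  auto x = std::vector<double>(n * n, 0.0);  // column-major, like the reference below
  for (auto i = std::size_t{0}; i < n; ++i) { x[i * n + i] = 1.0 / l[i * n + i]; }
  for (auto i = std::size_t{1}; i < n; ++i) {
    for (auto j = std::size_t{0}; j < i; ++j) {
      auto sum = 0.0;
      for (auto k = j; k < i; ++k) { sum += l[k * n + i] * x[j * n + k]; }
      x[j * n + i] = -sum * x[i * n + i];
    }
  }
  return x;
}
// Example: L = {{2, 0}, {3, 4}} gives X = {{0.5, 0}, {-0.375, 0.25}}, i.e. L * X == identity.
// =================================================================================================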
CLBlast-1.6.3/test/routines/levelx/xinvert.hpp000066400000000000000000000242561463263031500213570ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xinvert routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_
#define CLBLAST_TEST_ROUTINES_XINVERT_H_

#include "test/routines/common.hpp"
#include "src/routines/levelx/xinvert.hpp"

namespace clblast {
// =================================================================================================

template <typename T>
StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
  const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) ||
                         (args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor));

  // Helper variables
  const auto block_size = args.m;
  const auto num_blocks = CeilDiv(args.n, block_size);
  const auto a_ld = args.a_ld;
  const auto b_ld = block_size;

  // Checks for valid arguments
  if ((block_size == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; }
  if ((block_size % 16 != 0) || (block_size > 128)) { return StatusCode::kUnknownError; }

  // Start at zero
  for (size_t i = 0; i < args.m; ++i) {
    for (size_t j = 0; j < args.n; ++j) {
      buffers_host.b_mat[j * args.m + i] = T{0.0};
    }
  }

  // Loops over the number of diagonal blocks of size args.m by args.m each
  for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) {
    const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset;
    const auto b_offset = block_id * block_size * block_size;

    // Inverts the diagonal elements of the matrix
    for (auto i = size_t{0}; i < block_size; ++i) {
      auto a_value = T{1.0};
      if (args.diagonal == Diagonal::kNonUnit) {
        if (i + block_id * block_size < args.n) {
          if (buffers_host.a_mat[i * a_ld + i + a_offset] == T{0.0}) {
            return StatusCode::kUnknownError;
          }
          a_value = T{1.0} / buffers_host.a_mat[i * a_ld + i + a_offset];
        }
      }
      buffers_host.b_mat[i * b_ld + i + b_offset] = a_value;
    }

    // Inverts the upper triangle row by row
    if (is_upper) {
      for (int i = static_cast<int>(block_size) - 2; i >= 0; --i) {
        for (auto j = static_cast<int>(block_size) - 1; j > i; --j) {
          auto sum = T{0.0};
          for (auto k = i + 1; k <= j; ++k) {
            auto a_value = T{0.0};
            if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
              a_value = buffers_host.a_mat[k * a_ld + i + a_offset];
            }
            sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset];
          }
          buffers_host.b_mat[j * b_ld + i + b_offset] = -sum * buffers_host.b_mat[i * b_ld + i + b_offset];
        }
      }
    }

    // Inverts the lower triangle row by row
    else {
      for (auto i = size_t{1}; i < block_size; ++i) {
        for (auto j = size_t{0}; j < i; ++j) {
          auto sum = T{0.0};
          for (auto k = j; k < i; ++k) {
            auto a_value = T{0.0};
            if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
              a_value = buffers_host.a_mat[k * a_ld + i + a_offset];
            }
            sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset];
          }
          buffers_host.b_mat[j * b_ld + i + b_offset] = -sum * buffers_host.b_mat[i * b_ld + i + b_offset];
        }
      }
    }
  }
  return StatusCode::kSuccess;
}

// Half-precision version calling the above reference implementation after conversions
template <>
StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) {
  auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat);
  auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat);
  auto dummy = std::vector<float>(0);
  auto dummy_uint = std::vector<unsigned int>(0);
  auto buffers2 = BuffersHost<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy, dummy_uint};
  auto args2 = Arguments<float>();
  args2.a_size = args.a_size; args2.b_size = args.b_size;
  args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n;
  args2.a_offset = args.a_offset;
  args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal;
  auto status = RunReference(args2, buffers2);
  FloatToHalfBuffer(buffers_host.b_mat, buffers2.b_mat);
  return status;
}

// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXinvert {
 public:

  // The BLAS level: 4 for the extra routines
  static size_t BLASLevel() { return 4; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgM, kArgLayout, kArgTriangle, kArgDiagonal, kArgALeadDim, kArgAOffset};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; }
  static std::vector<std::string> BuffersOut() { return {kBufMatB}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeA(const Arguments<T> &args) { return args.n * args.a_ld + args.a_offset; }
  static size_t GetSizeB(const Arguments<T> &args) {
    const auto block_size = args.m;
    const auto num_blocks = CeilDiv(args.n, block_size);
    return num_blocks * block_size * block_size;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.b_size = GetSizeB(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    try {
      #ifdef OPENCL_API
        auto event = cl_event{};
        auto inverter = Xinvert<T>(queue, &event);
        inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
                                            args.n, args.m,
                                            buffers.a_mat, args.a_offset, args.a_ld,
                                            buffers.b_mat);
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
      #elif CUDA_API
        auto inverter = Xinvert<T>(queue, nullptr);
        inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
                                            args.n, args.m,
                                            buffers.a_mat, args.a_offset, args.a_ld,
                                            buffers.b_mat);
        cuStreamSynchronize(queue());
      #endif
    } catch (...) { return DispatchException(); }
    return StatusCode::kSuccess;
  }

  // Describes how to run a naive version of the routine (for correctness/performance comparison).
  // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
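  // As a concrete check of the reference above (illustrative numbers, not taken from the test
  // suite, and ignoring the block-size restrictions that only matter for the device kernels):
  // for a non-unit upper-triangular 2x2 block A = [[2, 4], [0, 8]], the diagonal pass stores
  // b00 = 1/2 and b11 = 1/8, and the row-by-row pass computes
  //   b01 = -(a01 * b11) * b00 = -(4 * 1/8) * (1/2) = -1/4
  // so that A * B = I, i.e. b_ij = -(sum_k a_ik * b_kj) / a_ii for j > i.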
  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    auto buffers_host = BuffersHost<T>();
    DeviceToHost(args, buffers, buffers_host, queue, BuffersIn());
    const auto status = RunReference(args, buffers_host);
    HostToDevice(args, buffers, buffers_host, queue, BuffersOut());
    return status;
  }

  static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) {
    return RunReference(args, buffers_host);
  }

  static StatusCode RunReference3(const Arguments<T> &, BuffersCUDA<T> &, Queue &) {
    return StatusCode::kUnknownError;
  }

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.b_size, static_cast<T>(0));
    buffers.b_mat.Read(queue, args.b_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
  static size_t ResultID2(const Arguments<T> &args) { return Ceil(args.n, args.m); }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return id1 * Ceil(args.n, args.m) + id2;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) {
    const auto block_size = args.m;
    const auto num_blocks = CeilDiv(args.n, block_size);
    return num_blocks * (block_size * (block_size / 2) * (block_size / 2));
  }
  static size_t GetBytes(const Arguments<T> &args) {
    return (args.a_size + args.b_size) * sizeof(T); // reads A, writes B
  }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XINVERT_H_
#endif
CLBlast-1.6.3/test/routines/levelx/xomatcopy.hpp000066400000000000000000000227401463263031500216770ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xomatcopy routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================

#ifndef CLBLAST_TEST_ROUTINES_XOMATCOPY_H_
#define CLBLAST_TEST_ROUTINES_XOMATCOPY_H_

#include "test/routines/common.hpp"

namespace clblast {
// =================================================================================================

template <typename T>
StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {

  // Checking for invalid arguments
  const auto a_rotated = (args.layout == Layout::kRowMajor);
  const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
                         (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
  const auto a_base = (a_rotated) ? args.a_ld*(args.m-1) + args.n : args.a_ld*(args.n-1) + args.m;
  const auto b_base = (b_rotated) ? args.b_ld*(args.m-1) + args.n : args.b_ld*(args.n-1) + args.m;
  if ((args.m == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; }
  if ((args.a_ld < args.m && !a_rotated) || (args.a_ld < args.n && a_rotated)) {
    return StatusCode::kInvalidLeadDimA;
  }
  if ((args.b_ld < args.m && !b_rotated) || (args.b_ld < args.n && b_rotated)) {
    return StatusCode::kInvalidLeadDimB;
  }
  if (buffers_host.a_mat.size() * sizeof(T) < (a_base + args.a_offset) * sizeof(T)) {
    return StatusCode::kInsufficientMemoryA;
  }
  if (buffers_host.b_mat.size() * sizeof(T) < (b_base + args.b_offset) * sizeof(T)) {
    return StatusCode::kInsufficientMemoryB;
  }

  // Matrix copy, scaling, and/or transpose
  for (auto id1 = size_t{0}; id1 < args.m; ++id1) {
    for (auto id2 = size_t{0}; id2 < args.n; ++id2) {
      const auto a_one = (a_rotated) ? id2 : id1;
      const auto a_two = (a_rotated) ? id1 : id2;
      const auto b_one = (b_rotated) ? id2 : id1;
      const auto b_two = (b_rotated) ? id1 : id2;
      const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
      const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
      auto a_value = buffers_host.a_mat[a_index];
      if (args.a_transpose == Transpose::kConjugate) { a_value = ComplexConjugate(a_value); }
      buffers_host.b_mat[b_index] = args.alpha * a_value;
    }
  }
  return StatusCode::kSuccess;
}

// Half-precision version calling the above reference implementation after conversions
template <>
StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) {
  auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat);
  auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat);
  auto dummy = std::vector<float>(0);
  auto dummy_uint = std::vector<unsigned int>(0);
  auto buffers2 = BuffersHost<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy, dummy_uint};
  auto args2 = Arguments<float>();
  args2.a_size = args.a_size; args2.b_size = args.b_size;
  args2.a_ld = args.a_ld; args2.b_ld = args.b_ld;
  args2.m = args.m; args2.n = args.n;
  args2.a_offset = args.a_offset; args2.b_offset = args.b_offset;
  args2.layout = args.layout; args2.a_transpose = args.a_transpose;
  args2.alpha = HalfToFloat(args.alpha);
  auto status = RunReference(args2, buffers2);
  FloatToHalfBuffer(buffers_host.b_mat, buffers2.b_mat);
  return status;
}

// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class TestXomatcopy {
 public:

  // The BLAS level: 4 for the extra routines
  static size_t BLASLevel() { return 4; }

  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgM, kArgN, kArgLayout, kArgATransp,
            kArgALeadDim, kArgBLeadDim, kArgAOffset, kArgBOffset,
            kArgAlpha};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; }
  static std::vector<std::string> BuffersOut() { return {kBufMatB}; }

  // Describes how to obtain the sizes of the buffers
  static size_t GetSizeA(const Arguments<T> &args) {
    const auto a_rotated = (args.layout == Layout::kRowMajor);
    const auto a_two = (a_rotated) ? args.m : args.n;
    return a_two * args.a_ld + args.a_offset;
  }
  static size_t GetSizeB(const Arguments<T> &args) {
    const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
                           (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
    const auto b_two = (b_rotated) ? args.n : args.m;
    return b_two * args.b_ld + args.b_offset;
  }

  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args);
    args.b_size = GetSizeB(args);
  }

  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
  static size_t DefaultLDB(const Arguments<T> &args) { return args.m; }
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine

  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &all) { return all; }
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine

  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine

  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Omatcopy<T>(args.layout, args.a_transpose,
                                args.m, args.n, args.alpha,
                                buffers.a_mat(), args.a_offset, args.a_ld,
                                buffers.b_mat(), args.b_offset, args.b_ld,
                                &queue_plain, &event);
      if (status == StatusCode::kSuccess) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
      }
    #elif CUDA_API
      auto status = Omatcopy<T>(args.layout, args.a_transpose,
                                args.m, args.n, args.alpha,
                                buffers.a_mat(), args.a_offset, args.a_ld,
                                buffers.b_mat(), args.b_offset, args.b_ld,
                                queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }

  // Describes how to run a naive version of the routine (for correctness/performance comparison).
  // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    auto buffers_host = BuffersHost<T>();
    DeviceToHost(args, buffers, buffers_host, queue, BuffersIn());
    const auto status = RunReference(args, buffers_host);
    HostToDevice(args, buffers, buffers_host, queue, BuffersOut());
    return status;
  }

  static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) {
    return RunReference(args, buffers_host);
  }

  static StatusCode RunReference3(const Arguments<T> &, BuffersCUDA<T> &, Queue &) {
    return StatusCode::kUnknownError;
  }

  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.b_size, static_cast<T>(0));
    buffers.b_mat.Read(queue, args.b_size, result);
    return result;
  }

  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
                           (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
    const auto b_one = (b_rotated) ? id2 : id1;
    const auto b_two = (b_rotated) ? id1 : id2;
    return b_two * args.b_ld + b_one + args.b_offset;
  }

  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &args) { return args.m*args.n; }
  static size_t GetBytes(const Arguments<T> &args) { return (2*args.m*args.n) * sizeof(T); }
};

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_ROUTINES_XOMATCOPY_H_
#endif
CLBlast-1.6.3/test/test_utilities.cpp000066400000000000000000000312471463263031500175540ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the test utility functions.
//
// =================================================================================================

#include <string>
#include <vector>
#include <fstream>
#include <algorithm>

#include "test/test_utilities.hpp"

namespace clblast {
// =================================================================================================

// Returns whether a scalar is close to zero
template <typename T> bool IsCloseToZero(const T value) {
  return (value > -SmallConstant<T>()) && (value < SmallConstant<T>());
}
template bool IsCloseToZero(const float);
template bool IsCloseToZero(const double);
template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); }
template <> bool IsCloseToZero(const float2 value) {
  return IsCloseToZero(value.real()) || IsCloseToZero(value.imag());
}
template <> bool IsCloseToZero(const double2 value) {
  return IsCloseToZero(value.real()) || IsCloseToZero(value.imag());
}

// =================================================================================================

// Performs a complex conjugate if complex
template <typename T> T ComplexConjugate(const T value) { return value; }
template half ComplexConjugate(const half);
template float ComplexConjugate(const float);
template double ComplexConjugate(const double);
template <> float2 ComplexConjugate(const float2 value) { return float2{value.real(), -value.imag()}; }
template <> double2 ComplexConjugate(const double2 value) { return double2{value.real(), -value.imag()}; }

// =================================================================================================

template <typename T>
void DeviceToHost(const Arguments<T> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
                  Queue &queue, const std::vector<std::string> &names) {
  for (auto &name: names) {
    if (name == kBufVecX) {
      buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(0));
      buffers.x_vec.Read(queue, args.x_size, buffers_host.x_vec);
    }
    else if (name == kBufVecY) {
      buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0));
      buffers.y_vec.Read(queue, args.y_size, buffers_host.y_vec);
    }
    else if (name == kBufMatA) {
      buffers_host.a_mat = std::vector<T>(args.a_size, static_cast<T>(0));
      buffers.a_mat.Read(queue, args.a_size, buffers_host.a_mat);
    }
    else if (name == kBufMatB) {
      buffers_host.b_mat = std::vector<T>(args.b_size, static_cast<T>(0));
      buffers.b_mat.Read(queue, args.b_size, buffers_host.b_mat);
    }
    else if (name == kBufMatC) {
      buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0));
      buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat);
    }
    else if (name == kBufMatAP) {
      buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0));
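      // Note (general BLAS convention, not specific to this function): the AP buffer holds the
      // packed form of an n-by-n triangular or symmetric matrix, so args.ap_size is typically
      //   n * (n + 1) / 2 + ap_offset
      // elements rather than n * n.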
buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); } else if (name == kBufScalar) { buffers_host.scalar = std::vector(args.scalar_size, static_cast(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); } else if (name == kBufScalarUint) { buffers_host.scalar_uint = std::vector(args.scalar_size, 0); buffers.scalar_uint.Read(queue, args.scalar_size, buffers_host.scalar_uint); } else { throw std::runtime_error("Invalid buffer name"); } } } template void HostToDevice(const Arguments &args, Buffers &buffers, BuffersHost &buffers_host, Queue &queue, const std::vector &names) { for (auto &name: names) { if (name == kBufVecX) { buffers.x_vec.Write(queue, args.x_size, buffers_host.x_vec); } else if (name == kBufVecY) { buffers.y_vec.Write(queue, args.y_size, buffers_host.y_vec); } else if (name == kBufMatA) { buffers.a_mat.Write(queue, args.a_size, buffers_host.a_mat); } else if (name == kBufMatB) { buffers.b_mat.Write(queue, args.b_size, buffers_host.b_mat); } else if (name == kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); } else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); } else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); } else if (name == kBufScalarUint) { buffers.scalar_uint.Write(queue, args.scalar_size, buffers_host.scalar_uint); } else { throw std::runtime_error("Invalid buffer name"); } } } // Compiles the above functions template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void DeviceToHost(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); template void HostToDevice(const Arguments&, Buffers&, BuffersHost&, Queue&, const std::vector&); // ================================================================================================= // Conversion between half and single-precision std::vector HalfToFloatBuffer(const std::vector& source) { auto result = std::vector(source.size()); for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); } return result; } void FloatToHalfBuffer(std::vector& result, const std::vector& source) { for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); } } // As above, but now for OpenCL data-types instead of std::vectors #ifdef OPENCL_API Buffer HalfToFloatBuffer(const Buffer& source, RawCommandQueue queue_raw) { const 
auto size = source.GetSize() / sizeof(half); auto queue = Queue(queue_raw); auto context = queue.GetContext(); auto source_cpu = std::vector(size); source.Read(queue, size, source_cpu); auto result_cpu = HalfToFloatBuffer(source_cpu); auto result = Buffer(context, size); result.Write(queue, size, result_cpu); return result; } void FloatToHalfBuffer(Buffer& result, const Buffer& source, RawCommandQueue queue_raw) { const auto size = source.GetSize() / sizeof(float); auto queue = Queue(queue_raw); auto context = queue.GetContext(); auto source_cpu = std::vector(size); source.Read(queue, size, source_cpu); auto result_cpu = std::vector(size); FloatToHalfBuffer(result_cpu, source_cpu); result.Write(queue, size, result_cpu); } #endif // ================================================================================================= void OverrideParametersFromJSONFiles(const std::vector& file_names, const RawDeviceID device, const Precision precision) { // Retrieves the best parameters for each file from disk BestParametersCollection all_parameters; for (const auto& json_file_name : file_names) { GetBestParametersFromJSONFile(json_file_name, all_parameters, precision); } // Applies the parameter override for (const auto &best_parameters : all_parameters) { const auto kernel_family = best_parameters.first; const auto parameters = best_parameters.second; const auto status = OverrideParameters(device, kernel_family, precision, parameters); if (status == StatusCode::kSuccess) { fprintf(stdout, "* Applying parameter override successfully for '%s'\n", kernel_family.c_str()); } else { fprintf(stdout, "* Error while applying parameter override for '%s'\n", kernel_family.c_str()); } } if (file_names.size() > 0) { fprintf(stdout, "\n"); } } void GetBestParametersFromJSONFile(const std::string& file_name, BestParametersCollection& all_parameters, const Precision precision) { std::ifstream json_file(file_name); if (!json_file) { fprintf(stdout, "* Could not open file '%s'\n", file_name.c_str()); return; } fprintf(stdout, "* Reading override-parameters from '%s'\n", file_name.c_str()); std::string line; auto kernel_family = std::string{}; while (std::getline(json_file, line)) { const auto line_split = split(line, ':'); if (line_split.size() != 2) { continue; } // Retrieves the kernel name if (line_split[0] == " \"kernel_family\"") { const auto value_split = split(line_split[1], '\"'); if (value_split.size() != 3) { break; } kernel_family = value_split[1]; kernel_family[0] = toupper(kernel_family[0]); // because of a tuner - database naming mismatch kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '_'), kernel_family.end()); kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '1'), kernel_family.end()); kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '2'), kernel_family.end()); kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '3'), kernel_family.end()); if (kernel_family == "Xgemmdirect") { kernel_family = "XgemmDirect"; } // more kinds of mismatches } // Retrieves the best-parameters and sets the override if (line_split[0] == " \"best_parameters\"" && kernel_family != std::string{""}) { const auto value_split = split(line_split[1], '\"'); if (value_split.size() != 3) { break; } const auto config_split = split(value_split[1], ' '); if (config_split.size() == 0) { break; } // Loads an existing list of parameters for this kernel family (if present) BestParameters parameters; if 
(all_parameters.count(kernel_family) == 1) { parameters = all_parameters.at(kernel_family); } // Creates the list of parameters fprintf(stdout, "* Found parameters for kernel '%s': { ", kernel_family.c_str()); for (const auto& config : config_split) { const auto params_split = split(config, '='); if (params_split.size() != 2) { break; } const auto parameter_name = params_split[0]; const auto parameter_value = static_cast(std::stoi(params_split[1].c_str())); if (parameter_name != "PRECISION") { printf("%s=%zu ", parameter_name.c_str(), parameter_value); parameters[parameter_name] = parameter_value; } else { if (static_cast(precision) != parameter_value) { fprintf(stdout, "ERROR! }\n"); fprintf(stdout, "* Precision is not matching, continuing\n"); json_file.close(); return; } } } fprintf(stdout, "}\n"); // Sets the new (possibly extended) parameter map as the final result all_parameters[kernel_family] = parameters; json_file.close(); return; } } // Ends this function (failure) fprintf(stdout, "* Failed to extract parameters from this file, continuing\n"); json_file.close(); } // ================================================================================================= } // namespace clblast CLBlast-1.6.3/test/test_utilities.hpp000066400000000000000000000123051463263031500175530ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file provides declarations for the common test utility functions (performance clients and // correctness testers). 
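// For illustration, GetBestParametersFromJSONFile() (declared below) expects tuner output with
// lines shaped roughly like the following (hypothetical values, not from a real tuning run):
//   "kernel_family": "xgemm_1",
//   "best_parameters": "PRECISION=32 MWG=64 NWG=64 KWG=32"
// It splits on ':' and '"' to find these two keys, then on ' ' and '=' to build the
// name-to-value map, dropping the PRECISION entry after checking it against the requested
// precision.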
// // ================================================================================================= #ifndef CLBLAST_TEST_UTILITIES_H_ #define CLBLAST_TEST_UTILITIES_H_ #include #include #include #include #include #include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= // The client-specific arguments in string form constexpr auto kArgCompareclblas = "clblas"; constexpr auto kArgComparecblas = "cblas"; constexpr auto kArgComparecublas = "cublas"; constexpr auto kArgStepSize = "step"; constexpr auto kArgNumSteps = "num_steps"; constexpr auto kArgWarmUp = "warm_up"; constexpr auto kArgTunerFiles = "tuner_files"; // The test-specific arguments in string form constexpr auto kArgFullTest = "full_test"; constexpr auto kArgVerbose = "verbose"; // ================================================================================================= // Returns whether a scalar is close to zero template bool IsCloseToZero(const T value); // ================================================================================================= // Structure containing all possible buffers for test clients template struct Buffers { Buffer x_vec; Buffer y_vec; Buffer a_mat; Buffer b_mat; Buffer c_mat; Buffer ap_mat; Buffer scalar; Buffer scalar_uint; }; template struct BuffersHost { std::vector x_vec; std::vector y_vec; std::vector a_mat; std::vector b_mat; std::vector c_mat; std::vector ap_mat; std::vector scalar; std::vector scalar_uint; }; // ================================================================================================= template T ComplexConjugate(const T value); // ================================================================================================= // Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast // data-types such as the Layout and Transpose data-types. template std::string ToString(T value); // ================================================================================================= // Copies buffers from the OpenCL device to the host template void DeviceToHost(const Arguments &args, Buffers &buffers, BuffersHost &buffers_host, Queue &queue, const std::vector &names); // Copies buffers from the host to the OpenCL device template void HostToDevice(const Arguments &args, Buffers &buffers, BuffersHost &buffers_host, Queue &queue, const std::vector &names); // ================================================================================================= // Conversion between half and single-precision std::vector HalfToFloatBuffer(const std::vector& source); void FloatToHalfBuffer(std::vector& result, const std::vector& source); // As above, but now for OpenCL data-types instead of std::vectors #ifdef OPENCL_API Buffer HalfToFloatBuffer(const Buffer& source, RawCommandQueue queue_raw); void FloatToHalfBuffer(Buffer& result, const Buffer& source, RawCommandQueue queue_raw); #endif // ================================================================================================= // Creates a buffer but don't test for validity. That's the reason this is not using the clpp11.h or // cupp11.h interface. 
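// A minimal usage sketch (hypothetical test code, assuming an OpenCL build and an existing
// device object):
//   auto context = Context(device);
//   auto a_mat = CreateInvalidBuffer<float>(context, 1); // deliberately undersized, unchecked
//   // ...pass a_mat to a routine and expect an error such as StatusCode::kInsufficientMemoryA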
template Buffer CreateInvalidBuffer(const Context& context, const size_t size) { #ifdef OPENCL_API auto raw_buffer = clCreateBuffer(context(), CL_MEM_READ_WRITE, size * sizeof(T), nullptr, nullptr); #elif CUDA_API CUdeviceptr raw_buffer; cuMemAlloc(&raw_buffer, size * sizeof(T)); #endif return Buffer(raw_buffer); } // ================================================================================================= using BestParameters = std::unordered_map; using BestParametersCollection = std::unordered_map; void OverrideParametersFromJSONFiles(const std::vector& file_names, const RawDeviceID device, const Precision precision); void GetBestParametersFromJSONFile(const std::string& file_name, BestParametersCollection& all_parameters, const Precision precision); // ================================================================================================= } // namespace clblast // CLBLAST_TEST_UTILITIES_H_ #endif CLBlast-1.6.3/test/wrapper_cblas.hpp000066400000000000000000003437371463263031500173450ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a wrapper around a CPU BLAS library, such that its routines can be called // in a similar way as the CLBlast routines: using alpha and beta to determine the precision. // // ================================================================================================= #ifndef CLBLAST_TEST_WRAPPER_CBLAS_H_ #define CLBLAST_TEST_WRAPPER_CBLAS_H_ extern "C" { #ifdef CLBLAST_REF_CBLAS_MKL #include #else #include #endif } #include "utilities/utilities.hpp" namespace clblast { // Conversions from CLBlast types CBLAS_ORDER convertToCBLAS(const Layout v) { return (v == Layout::kRowMajor) ? CblasRowMajor : CblasColMajor; } CBLAS_TRANSPOSE convertToCBLAS(const Transpose v) { return (v == Transpose::kNo) ? CblasNoTrans : (v == Transpose::kYes) ? CblasTrans : CblasConjTrans; } CBLAS_UPLO convertToCBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CblasUpper : CblasLower; } CBLAS_DIAG convertToCBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CblasUnit : CblasNonUnit; } CBLAS_SIDE convertToCBLAS(const Side v) { return (v == Side::kLeft) ? 
CblasLeft : CblasRight; } // OpenBLAS is not fully Netlib CBLAS compatible #ifdef OPENBLAS_VERSION using return_pointer_float = openblas_complex_float*; using return_pointer_double = openblas_complex_double*; #else using return_pointer_float = void*; using return_pointer_double = void*; #endif // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= // Forwards the Netlib BLAS calls for SROTG/DROTG void cblasXrotg(std::vector& sa_buffer, const size_t sa_offset, std::vector& sb_buffer, const size_t sb_offset, std::vector& sc_buffer, const size_t sc_offset, std::vector& ss_buffer, const size_t ss_offset) { cblas_srotg(&sa_buffer[sa_offset], &sb_buffer[sb_offset], &sc_buffer[sc_offset], &ss_buffer[ss_offset]); } void cblasXrotg(std::vector& sa_buffer, const size_t sa_offset, std::vector& sb_buffer, const size_t sb_offset, std::vector& sc_buffer, const size_t sc_offset, std::vector& ss_buffer, const size_t ss_offset) { cblas_drotg(&sa_buffer[sa_offset], &sb_buffer[sb_offset], &sc_buffer[sc_offset], &ss_buffer[ss_offset]); } // Forwards the Netlib BLAS calls for SROTMG/DROTMG void cblasXrotmg(std::vector& sd1_buffer, const size_t sd1_offset, std::vector& sd2_buffer, const size_t sd2_offset, std::vector& sx1_buffer, const size_t sx1_offset, const std::vector& sy1_buffer, const size_t sy1_offset, std::vector& sparam_buffer, const size_t sparam_offset) { cblas_srotmg(&sd1_buffer[sd1_offset], &sd2_buffer[sd2_offset], &sx1_buffer[sx1_offset], sy1_buffer[sy1_offset], &sparam_buffer[sparam_offset]); } void cblasXrotmg(std::vector& sd1_buffer, const size_t sd1_offset, std::vector& sd2_buffer, const size_t sd2_offset, std::vector& sx1_buffer, const size_t sx1_offset, const std::vector& sy1_buffer, const size_t sy1_offset, std::vector& sparam_buffer, const size_t sparam_offset) { cblas_drotmg(&sd1_buffer[sd1_offset], &sd2_buffer[sd2_offset], &sx1_buffer[sx1_offset], sy1_buffer[sy1_offset], &sparam_buffer[sparam_offset]); } // Forwards the Netlib BLAS calls for SROT/DROT void cblasXrot(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc, const float cos, const float sin) { cblas_srot(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), cos, sin); } void cblasXrot(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc, const double cos, const double sin) { cblas_drot(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), cos, sin); } // Forwards the Netlib BLAS calls for SROTM/DROTM void cblasXrotm(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& sparam_buffer, const size_t sparam_offset) { cblas_srotm(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), &sparam_buffer[sparam_offset]); } void cblasXrotm(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc, std::vector& sparam_buffer, const size_t sparam_offset) { cblas_drotm(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], 
static_cast(y_inc), &sparam_buffer[sparam_offset]); } // Forwards the Netlib BLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sswap(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dswap(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_cswap(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_zswap(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXswap(const size_t n, std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer); auto y_buffer_bis = HalfToFloatBuffer(y_buffer); cblasXswap(n, x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc); FloatToHalfBuffer(x_buffer, x_buffer_bis); FloatToHalfBuffer(y_buffer, y_buffer_bis); } // Forwards the Netlib BLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL void cblasXscal(const size_t n, const float alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_sscal(static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc)); } void cblasXscal(const size_t n, const double alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dscal(static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc)); } void cblasXscal(const size_t n, const float2 alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_cscal(static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXscal(const size_t n, const double2 alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_zscal(static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXscal(const size_t n, const half alpha, std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer); cblasXscal(n, HalfToFloat(alpha), x_buffer_bis, x_offset, x_inc); FloatToHalfBuffer(x_buffer, x_buffer_bis); } // Forwards the Netlib BLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_scopy(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t 
y_offset, const size_t y_inc) { cblas_dcopy(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_ccopy(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_zcopy(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXcopy(const size_t n, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer); auto y_buffer_bis = HalfToFloatBuffer(y_buffer); cblasXcopy(n, x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc); FloatToHalfBuffer(y_buffer, y_buffer_bis); } // Forwards the Netlib BLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY void cblasXaxpy(const size_t n, const float alpha, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_saxpy(static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXaxpy(const size_t n, const double alpha, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_daxpy(static_cast(n), alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXaxpy(const size_t n, const float2 alpha, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_caxpy(static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXaxpy(const size_t n, const double2 alpha, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_zaxpy(static_cast(n), alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXaxpy(const size_t n, const half alpha, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer); auto y_buffer_bis = HalfToFloatBuffer(y_buffer); cblasXaxpy(n, HalfToFloat(alpha), x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc); FloatToHalfBuffer(y_buffer, y_buffer_bis); } // Forwards the Netlib BLAS calls for SDOT/DDOT void cblasXdot(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { dot_buffer[dot_offset] = cblas_sdot(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], 
static_cast(y_inc)); } void cblasXdot(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { dot_buffer[dot_offset] = cblas_ddot(static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } void cblasXdot(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer); auto y_buffer_bis = HalfToFloatBuffer(y_buffer); auto dot_buffer_bis = HalfToFloatBuffer(dot_buffer); cblasXdot(n, dot_buffer_bis, dot_offset, x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc); FloatToHalfBuffer(dot_buffer, dot_buffer_bis); } // Forwards the Netlib BLAS calls for CDOTU/ZDOTU void cblasXdotu(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_cdotu_sub(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); } void cblasXdotu(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_zdotu_sub(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); } // Forwards the Netlib BLAS calls for CDOTC/ZDOTC void cblasXdotc(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_cdotc_sub(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); } void cblasXdotc(const size_t n, std::vector& dot_buffer, const size_t dot_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_zdotc_sub(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); } // Forwards the Netlib BLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { nrm2_buffer[nrm2_offset] = cblas_snrm2(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { nrm2_buffer[nrm2_offset] = cblas_dnrm2(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { nrm2_buffer[nrm2_offset].real(cblas_scnrm2(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), 
static_cast(x_inc))); } void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { nrm2_buffer[nrm2_offset].real(cblas_dznrm2(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer); auto nrm2_buffer_bis = HalfToFloatBuffer(nrm2_buffer); cblasXnrm2(n, nrm2_buffer_bis, nrm2_offset, x_buffer_bis, x_offset, x_inc); FloatToHalfBuffer(nrm2_buffer, nrm2_buffer_bis); } // Forwards the Netlib BLAS calls for SASUM/DASUM/ScASUM/DzASUM void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { asum_buffer[asum_offset] = cblas_sasum(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { asum_buffer[asum_offset] = cblas_dasum(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { asum_buffer[asum_offset].real(cblas_scasum(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { asum_buffer[asum_offset].real(cblas_dzasum(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer); auto asum_buffer_bis = HalfToFloatBuffer(asum_buffer); cblasXasum(n, asum_buffer_bis, asum_offset, x_buffer_bis, x_offset, x_inc); FloatToHalfBuffer(asum_buffer, asum_buffer_bis); } // Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { imax_buffer[imax_offset] = cblas_isamax(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { imax_buffer[imax_offset] = cblas_idamax(static_cast(n), &x_buffer[x_offset], static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { imax_buffer[imax_offset] = cblas_icamax(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { imax_buffer[imax_offset] = cblas_izamax(static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { auto x_buffer_bis = 
HalfToFloatBuffer(x_buffer); auto imax_buffer_bis = imax_buffer; cblasXamax(n, imax_buffer_bis, imax_offset, x_buffer_bis, x_offset, x_inc); } // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= // Forwards the Netlib BLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const float alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sgemv(layout, a_transpose, static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], static_cast(a_ld), &x_buffer[x_offset], static_cast(x_inc), beta, &y_buffer[y_offset], static_cast(y_inc)); } void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const double alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dgemv(layout, a_transpose, static_cast(m), static_cast(n), alpha, &a_buffer[a_offset], static_cast(a_ld), &x_buffer[x_offset], static_cast(x_inc), beta, &y_buffer[y_offset], static_cast(y_inc)); } void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const float2 alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_cgemv(layout, a_transpose, static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), static_cast(a_ld), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), beta_array.data(), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const double2 alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zgemv(layout, a_transpose, static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), static_cast(a_ld), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), beta_array.data(), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const half alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { auto a_buffer_bis = HalfToFloatBuffer(a_buffer); auto x_buffer_bis = 
HalfToFloatBuffer(x_buffer); auto y_buffer_bis = HalfToFloatBuffer(y_buffer); cblasXgemv(layout, a_transpose, m, n, HalfToFloat(alpha), a_buffer_bis, a_offset, a_ld, x_buffer_bis, x_offset, x_inc, HalfToFloat(beta), y_buffer_bis, y_offset, y_inc); FloatToHalfBuffer(y_buffer, y_buffer_bis); } // Forwards the Netlib BLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sgbmv(layout, a_transpose, static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), alpha, &a_buffer[a_offset], static_cast(a_ld), &x_buffer[x_offset], static_cast(x_inc), beta, &y_buffer[y_offset], static_cast(y_inc)); } void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dgbmv(layout, a_transpose, static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), alpha, &a_buffer[a_offset], static_cast(a_ld), &x_buffer[x_offset], static_cast(x_inc), beta, &y_buffer[y_offset], static_cast(y_inc)); } void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float2 alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_cgbmv(layout, a_transpose, static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), static_cast(a_ld), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), beta_array.data(), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double2 alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; const auto beta_array = std::vector{beta.real(), beta.imag()}; cblas_zgbmv(layout, a_transpose, static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), static_cast(a_ld), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), beta_array.data(), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const half alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, const std::vector& x_buffer, 
void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const size_t m,
                const size_t n, const size_t kl, const size_t ku, const half alpha,
                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
                const half beta, std::vector<half>& y_buffer, const size_t y_offset,
                const size_t y_inc) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer);
  cblasXgbmv(layout, a_transpose, m, n, kl, ku, HalfToFloat(alpha),
             a_buffer_bis, a_offset, a_ld,
             x_buffer_bis, x_offset, x_inc, HalfToFloat(beta),
             y_buffer_bis, y_offset, y_inc);
  FloatToHalfBuffer(y_buffer, y_buffer_bis);
}

// Forwards the Netlib BLAS calls for CHEMV/ZHEMV
void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const float2 alpha, const std::vector<float2>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<float2>& x_buffer, const size_t x_offset,
                const size_t x_inc, const float2 beta, std::vector<float2>& y_buffer,
                const size_t y_offset, const size_t y_inc) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<float>{beta.real(), beta.imag()};
  cblas_chemv(layout, triangle, static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              beta_array.data(),
              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
}
void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const double2 alpha, const std::vector<double2>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<double2>& x_buffer, const size_t x_offset,
                const size_t x_inc, const double2 beta, std::vector<double2>& y_buffer,
                const size_t y_offset, const size_t y_inc) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<double>{beta.real(), beta.imag()};
  cblas_zhemv(layout, triangle, static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              beta_array.data(),
              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
}

// Forwards the Netlib BLAS calls for CHBMV/ZHBMV
void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const size_t k, const float2 alpha, const std::vector<float2>& a_buffer,
                const size_t a_offset, const size_t a_ld, const std::vector<float2>& x_buffer,
                const size_t x_offset, const size_t x_inc, const float2 beta,
                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<float>{beta.real(), beta.imag()};
  cblas_chbmv(layout, triangle, static_cast<int>(n), static_cast<int>(k), alpha_array.data(),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              beta_array.data(),
              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
}
void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const size_t k, const double2 alpha, const std::vector<double2>& a_buffer,
                const size_t a_offset, const size_t a_ld, const std::vector<double2>& x_buffer,
                const size_t x_offset, const size_t x_inc, const double2 beta,
                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<double>{beta.real(), beta.imag()};
  cblas_zhbmv(layout, triangle, static_cast<int>(n), static_cast<int>(k), alpha_array.data(),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              beta_array.data(),
              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
}
// Forwards the Netlib BLAS calls for CHPMV/ZHPMV
void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const float2 alpha, const std::vector<float2>& ap_buffer, const size_t ap_offset,
                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                const float2 beta, std::vector<float2>& y_buffer, const size_t y_offset,
                const size_t y_inc) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<float>{beta.real(), beta.imag()};
  cblas_chpmv(layout, triangle, static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              beta_array.data(),
              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
}
void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const double2 alpha, const std::vector<double2>& ap_buffer, const size_t ap_offset,
                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                const double2 beta, std::vector<double2>& y_buffer, const size_t y_offset,
                const size_t y_inc) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<double>{beta.real(), beta.imag()};
  cblas_zhpmv(layout, triangle, static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              beta_array.data(),
              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
}

// Forwards the Netlib BLAS calls for SSYMV/DSYMV
void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const float alpha, const std::vector<float>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<float>& x_buffer, const size_t x_offset,
                const size_t x_inc, const float beta, std::vector<float>& y_buffer,
                const size_t y_offset, const size_t y_inc) {
  cblas_ssymv(layout, triangle, static_cast<int>(n), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc), beta,
              &y_buffer[y_offset], static_cast<int>(y_inc));
}
void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const double alpha, const std::vector<double>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<double>& x_buffer, const size_t x_offset,
                const size_t x_inc, const double beta, std::vector<double>& y_buffer,
                const size_t y_offset, const size_t y_inc) {
  cblas_dsymv(layout, triangle, static_cast<int>(n), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc), beta,
              &y_buffer[y_offset], static_cast<int>(y_inc));
}
void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const half alpha, const std::vector<half>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<half>& x_buffer, const size_t x_offset,
                const size_t x_inc, const half beta, std::vector<half>& y_buffer,
                const size_t y_offset, const size_t y_inc) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer);
  cblasXsymv(layout, triangle, n, HalfToFloat(alpha),
             a_buffer_bis, a_offset, a_ld,
             x_buffer_bis, x_offset, x_inc, HalfToFloat(beta),
             y_buffer_bis, y_offset, y_inc);
  FloatToHalfBuffer(y_buffer, y_buffer_bis);
}
// Forwards the Netlib BLAS calls for SSBMV/DSBMV
void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const size_t k, const float alpha, const std::vector<float>& a_buffer,
                const size_t a_offset, const size_t a_ld, const std::vector<float>& x_buffer,
                const size_t x_offset, const size_t x_inc, const float beta,
                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
  cblas_ssbmv(layout, triangle, static_cast<int>(n), static_cast<int>(k), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc), beta,
              &y_buffer[y_offset], static_cast<int>(y_inc));
}
void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const size_t k, const double alpha, const std::vector<double>& a_buffer,
                const size_t a_offset, const size_t a_ld, const std::vector<double>& x_buffer,
                const size_t x_offset, const size_t x_inc, const double beta,
                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
  cblas_dsbmv(layout, triangle, static_cast<int>(n), static_cast<int>(k), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc), beta,
              &y_buffer[y_offset], static_cast<int>(y_inc));
}
void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const size_t k, const half alpha, const std::vector<half>& a_buffer,
                const size_t a_offset, const size_t a_ld, const std::vector<half>& x_buffer,
                const size_t x_offset, const size_t x_inc, const half beta,
                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer);
  cblasXsbmv(layout, triangle, n, k, HalfToFloat(alpha),
             a_buffer_bis, a_offset, a_ld,
             x_buffer_bis, x_offset, x_inc, HalfToFloat(beta),
             y_buffer_bis, y_offset, y_inc);
  FloatToHalfBuffer(y_buffer, y_buffer_bis);
}

// Forwards the Netlib BLAS calls for SSPMV/DSPMV
void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const float alpha, const std::vector<float>& ap_buffer, const size_t ap_offset,
                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                const float beta, std::vector<float>& y_buffer, const size_t y_offset,
                const size_t y_inc) {
  cblas_sspmv(layout, triangle, static_cast<int>(n), alpha,
              &ap_buffer[ap_offset],
              &x_buffer[x_offset], static_cast<int>(x_inc), beta,
              &y_buffer[y_offset], static_cast<int>(y_inc));
}
void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const double alpha, const std::vector<double>& ap_buffer, const size_t ap_offset,
                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                const double beta, std::vector<double>& y_buffer, const size_t y_offset,
                const size_t y_inc) {
  cblas_dspmv(layout, triangle, static_cast<int>(n), alpha,
              &ap_buffer[ap_offset],
              &x_buffer[x_offset], static_cast<int>(x_inc), beta,
              &y_buffer[y_offset], static_cast<int>(y_inc));
}
void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const half alpha, const std::vector<half>& ap_buffer, const size_t ap_offset,
                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
                const half beta, std::vector<half>& y_buffer, const size_t y_offset,
                const size_t y_inc) {
  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer);
  cblasXspmv(layout, triangle, n, HalfToFloat(alpha),
             ap_buffer_bis, ap_offset,
             x_buffer_bis, x_offset, x_inc, HalfToFloat(beta),
             y_buffer_bis, y_offset, y_inc);
  FloatToHalfBuffer(y_buffer, y_buffer_bis);
}
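// The half-precision overloads above emulate fp16 routines by round-tripping through single
// precision, since Netlib CBLAS has no native fp16 entry points: all inputs are up-converted,
// the S-routine runs, and only the output buffer is converted back. A usage sketch with
// illustrative sizes and buffer contents (assuming the FloatToHalf helper from CLBlast's
// half-precision utilities):
//   std::vector<half> a(n * n), x(n), y(n);  // filled elsewhere
//   cblasXsymv(CblasColMajor, CblasUpper, n, FloatToHalf(1.0f),
//              a, 0, n, x, 0, 1, FloatToHalf(0.0f), y, 0, 1);
//   // only y is written back as half; a and x are left untouched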
// Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV
void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_strmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_dtrmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_ctrmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_ztrmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  cblasXtrmv(layout, triangle, a_transpose, diagonal, n,
             a_buffer_bis, a_offset, a_ld,
             x_buffer_bis, x_offset, x_inc);
  FloatToHalfBuffer(x_buffer, x_buffer_bis);
}

// Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV
void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<float>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<float>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  cblas_stbmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k),
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<double>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<double>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  cblas_dtbmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k),
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<float2>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<float2>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  cblas_ctbmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<double2>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<double2>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  cblas_ztbmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<half>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<half>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  cblasXtbmv(layout, triangle, a_transpose, diagonal, n, k,
             a_buffer_bis, a_offset, a_ld,
             x_buffer_bis, x_offset, x_inc);
  FloatToHalfBuffer(x_buffer, x_buffer_bis);
}

// Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV
void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<float>& ap_buffer, const size_t ap_offset,
                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_stpmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              &ap_buffer[ap_offset],
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<double>& ap_buffer, const size_t ap_offset,
                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_dtpmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              &ap_buffer[ap_offset],
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<float2>& ap_buffer, const size_t ap_offset,
                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_ctpmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<double2>& ap_buffer, const size_t ap_offset,
                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_ztpmv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<half>& ap_buffer, const size_t ap_offset,
                std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  cblasXtpmv(layout, triangle, a_transpose, diagonal, n,
             ap_buffer_bis, ap_offset,
             x_buffer_bis, x_offset, x_inc);
  FloatToHalfBuffer(x_buffer, x_buffer_bis);
}
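// Packed-triangular routines (xTPMV above, xTPSV below) store only the n*(n+1)/2 meaningful
// elements of the triangle contiguously, which is why these wrappers take an ap_buffer with an
// offset but no leading dimension. For reference (standard BLAS packed layout, zero-based,
// column-major, upper triangle): element (i, j) with i <= j lives at
//   ap_buffer[ap_offset + i + j*(j+1)/2]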
// Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV
void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_strsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_dtrsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_ctrsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_ztrsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}

// Forwards the Netlib BLAS calls for STBSV/DTBSV/CTBSV/ZTBSV
void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<float>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<float>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  cblas_stbsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k),
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<double>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<double>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  cblas_dtbsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k),
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<float2>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<float2>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  cblas_ctbsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const size_t k, const std::vector<double2>& a_buffer, const size_t a_offset,
                const size_t a_ld, std::vector<double2>& x_buffer, const size_t x_offset,
                const size_t x_inc) {
  cblas_ztbsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}

// Forwards the Netlib BLAS calls for STPSV/DTPSV/CTPSV/ZTPSV
void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<float>& ap_buffer, const size_t ap_offset,
                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_stpsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              &ap_buffer[ap_offset],
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<double>& ap_buffer, const size_t ap_offset,
                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_dtpsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              &ap_buffer[ap_offset],
              &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<float2>& ap_buffer, const size_t ap_offset,
                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_ctpsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t n,
                const std::vector<double2>& ap_buffer, const size_t ap_offset,
                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
  cblas_ztpsv(layout, triangle, a_transpose, diagonal, static_cast<int>(n),
              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}

// Forwards the Netlib BLAS calls for SGER/DGER
void cblasXger(const CBLAS_ORDER layout, const size_t m, const size_t n, const float alpha,
               const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
               const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
               std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) {
  cblas_sger(layout, static_cast<int>(m), static_cast<int>(n), alpha,
             &x_buffer[x_offset], static_cast<int>(x_inc),
             &y_buffer[y_offset], static_cast<int>(y_inc),
             &a_buffer[a_offset], static_cast<int>(a_ld));
}
void cblasXger(const CBLAS_ORDER layout, const size_t m, const size_t n, const double alpha,
               const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
               const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
               std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) {
  cblas_dger(layout, static_cast<int>(m), static_cast<int>(n), alpha,
             &x_buffer[x_offset], static_cast<int>(x_inc),
             &y_buffer[y_offset], static_cast<int>(y_inc),
             &a_buffer[a_offset], static_cast<int>(a_ld));
}
void cblasXger(const CBLAS_ORDER layout, const size_t m, const size_t n, const half alpha,
               const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
               const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc,
               std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer);
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  cblasXger(layout, m, n, HalfToFloat(alpha),
            x_buffer_bis, x_offset, x_inc,
            y_buffer_bis, y_offset, y_inc,
            a_buffer_bis, a_offset, a_ld);
  FloatToHalfBuffer(a_buffer, a_buffer_bis);
}
// Forwards the Netlib BLAS calls for CGERU/ZGERU
void cblasXgeru(const CBLAS_ORDER layout, const size_t m, const size_t n, const float2 alpha,
                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  cblas_cgeru(layout, static_cast<int>(m), static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
}
void cblasXgeru(const CBLAS_ORDER layout, const size_t m, const size_t n, const double2 alpha,
                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  cblas_zgeru(layout, static_cast<int>(m), static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
}

// Forwards the Netlib BLAS calls for CGERC/ZGERC
void cblasXgerc(const CBLAS_ORDER layout, const size_t m, const size_t n, const float2 alpha,
                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  cblas_cgerc(layout, static_cast<int>(m), static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
}
void cblasXgerc(const CBLAS_ORDER layout, const size_t m, const size_t n, const double2 alpha,
                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  cblas_zgerc(layout, static_cast<int>(m), static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
}

// Forwards the Netlib BLAS calls for CHER/ZHER
void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const float alpha, const std::vector<float2>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<float2>& a_buffer, const size_t a_offset,
               const size_t a_ld) {
  cblas_cher(layout, triangle, static_cast<int>(n), alpha,
             reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
             reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
}
void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const double alpha, const std::vector<double2>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<double2>& a_buffer, const size_t a_offset,
               const size_t a_ld) {
  cblas_zher(layout, triangle, static_cast<int>(n), alpha,
             reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
             reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
}
// Forwards the Netlib BLAS calls for CHPR/ZHPR
void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const float alpha, const std::vector<float2>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<float2>& ap_buffer, const size_t ap_offset) {
  cblas_chpr(layout, triangle, static_cast<int>(n), alpha,
             reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
             reinterpret_cast<float*>(&ap_buffer[ap_offset]));
}
void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const double alpha, const std::vector<double2>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<double2>& ap_buffer, const size_t ap_offset) {
  cblas_zhpr(layout, triangle, static_cast<int>(n), alpha,
             reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
             reinterpret_cast<double*>(&ap_buffer[ap_offset]));
}

// Forwards the Netlib BLAS calls for CHER2/ZHER2
void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const float2 alpha, const std::vector<float2>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<float2>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<float2>& a_buffer, const size_t a_offset,
                const size_t a_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  cblas_cher2(layout, triangle, static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
}
void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const double2 alpha, const std::vector<double2>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<double2>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<double2>& a_buffer, const size_t a_offset,
                const size_t a_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  cblas_zher2(layout, triangle, static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
}

// Forwards the Netlib BLAS calls for CHPR2/ZHPR2
void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const float2 alpha, const std::vector<float2>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<float2>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<float2>& ap_buffer, const size_t ap_offset) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  cblas_chpr2(layout, triangle, static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
              reinterpret_cast<float*>(&ap_buffer[ap_offset]));
}
void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const double2 alpha, const std::vector<double2>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<double2>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<double2>& ap_buffer, const size_t ap_offset) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  cblas_zhpr2(layout, triangle, static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
              reinterpret_cast<double*>(&ap_buffer[ap_offset]));
}
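// For reference: xHER2/xHPR2 above compute the rank-2 Hermitian update
//   A := alpha * x * y^H + conj(alpha) * y * x^H + A,
// which is why alpha is complex here while the rank-1 xHER/xHPR take a real alpha (the update
// alpha * x * x^H only stays Hermitian for real alpha).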
// Forwards the Netlib BLAS calls for SSYR/DSYR
void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const float alpha, const std::vector<float>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<float>& a_buffer, const size_t a_offset,
               const size_t a_ld) {
  cblas_ssyr(layout, triangle, static_cast<int>(n), alpha,
             &x_buffer[x_offset], static_cast<int>(x_inc),
             &a_buffer[a_offset], static_cast<int>(a_ld));
}
void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const double alpha, const std::vector<double>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<double>& a_buffer, const size_t a_offset,
               const size_t a_ld) {
  cblas_dsyr(layout, triangle, static_cast<int>(n), alpha,
             &x_buffer[x_offset], static_cast<int>(x_inc),
             &a_buffer[a_offset], static_cast<int>(a_ld));
}
void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const half alpha, const std::vector<half>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<half>& a_buffer, const size_t a_offset,
               const size_t a_ld) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  cblasXsyr(layout, triangle, n, HalfToFloat(alpha),
            x_buffer_bis, x_offset, x_inc,
            a_buffer_bis, a_offset, a_ld);
  FloatToHalfBuffer(a_buffer, a_buffer_bis);
}

// Forwards the Netlib BLAS calls for SSPR/DSPR
void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const float alpha, const std::vector<float>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<float>& ap_buffer, const size_t ap_offset) {
  cblas_sspr(layout, triangle, static_cast<int>(n), alpha,
             &x_buffer[x_offset], static_cast<int>(x_inc),
             &ap_buffer[ap_offset]);
}
void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const double alpha, const std::vector<double>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<double>& ap_buffer, const size_t ap_offset) {
  cblas_dspr(layout, triangle, static_cast<int>(n), alpha,
             &x_buffer[x_offset], static_cast<int>(x_inc),
             &ap_buffer[ap_offset]);
}
void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
               const half alpha, const std::vector<half>& x_buffer, const size_t x_offset,
               const size_t x_inc, std::vector<half>& ap_buffer, const size_t ap_offset) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer);
  cblasXspr(layout, triangle, n, HalfToFloat(alpha),
            x_buffer_bis, x_offset, x_inc,
            ap_buffer_bis, ap_offset);
  FloatToHalfBuffer(ap_buffer, ap_buffer_bis);
}

// Forwards the Netlib BLAS calls for SSYR2/DSYR2
void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const float alpha, const std::vector<float>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<float>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<float>& a_buffer, const size_t a_offset,
                const size_t a_ld) {
  cblas_ssyr2(layout, triangle, static_cast<int>(n), alpha,
              &x_buffer[x_offset], static_cast<int>(x_inc),
              &y_buffer[y_offset], static_cast<int>(y_inc),
              &a_buffer[a_offset], static_cast<int>(a_ld));
}
void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const double alpha, const std::vector<double>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<double>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<double>& a_buffer, const size_t a_offset,
                const size_t a_ld) {
  cblas_dsyr2(layout, triangle, static_cast<int>(n), alpha,
              &x_buffer[x_offset], static_cast<int>(x_inc),
              &y_buffer[y_offset], static_cast<int>(y_inc),
              &a_buffer[a_offset], static_cast<int>(a_ld));
}
void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const half alpha, const std::vector<half>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<half>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<half>& a_buffer, const size_t a_offset,
                const size_t a_ld) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer);
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  cblasXsyr2(layout, triangle, n, HalfToFloat(alpha),
             x_buffer_bis, x_offset, x_inc,
             y_buffer_bis, y_offset, y_inc,
             a_buffer_bis, a_offset, a_ld);
  FloatToHalfBuffer(a_buffer, a_buffer_bis);
}

// Forwards the Netlib BLAS calls for SSPR2/DSPR2
void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const float alpha, const std::vector<float>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<float>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<float>& ap_buffer, const size_t ap_offset) {
  cblas_sspr2(layout, triangle, static_cast<int>(n), alpha,
              &x_buffer[x_offset], static_cast<int>(x_inc),
              &y_buffer[y_offset], static_cast<int>(y_inc),
              &ap_buffer[ap_offset]);
}
void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const double alpha, const std::vector<double>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<double>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<double>& ap_buffer, const size_t ap_offset) {
  cblas_dspr2(layout, triangle, static_cast<int>(n), alpha,
              &x_buffer[x_offset], static_cast<int>(x_inc),
              &y_buffer[y_offset], static_cast<int>(y_inc),
              &ap_buffer[ap_offset]);
}
void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const size_t n,
                const half alpha, const std::vector<half>& x_buffer, const size_t x_offset,
                const size_t x_inc, const std::vector<half>& y_buffer, const size_t y_offset,
                const size_t y_inc, std::vector<half>& ap_buffer, const size_t ap_offset) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer);
  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer);
  cblasXspr2(layout, triangle, n, HalfToFloat(alpha),
             x_buffer_bis, x_offset, x_inc,
             y_buffer_bis, y_offset, y_inc,
             ap_buffer_bis, ap_offset);
  FloatToHalfBuffer(ap_buffer, ap_buffer_bis);
}

// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================

// Forwards the Netlib BLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM
void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
                const CBLAS_TRANSPOSE b_transpose, const size_t m, const size_t n, const size_t k,
                const float alpha, const std::vector<float>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<float>& b_buffer, const size_t b_offset,
                const size_t b_ld, const float beta, std::vector<float>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  cblas_sgemm(layout, a_transpose, b_transpose,
              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &b_buffer[b_offset], static_cast<int>(b_ld), beta,
              &c_buffer[c_offset], static_cast<int>(c_ld));
}
void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
                const CBLAS_TRANSPOSE b_transpose, const size_t m, const size_t n, const size_t k,
                const double alpha, const std::vector<double>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<double>& b_buffer, const size_t b_offset,
                const size_t b_ld, const double beta, std::vector<double>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  cblas_dgemm(layout, a_transpose, b_transpose,
              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &b_buffer[b_offset], static_cast<int>(b_ld), beta,
              &c_buffer[c_offset], static_cast<int>(c_ld));
}
void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
                const CBLAS_TRANSPOSE b_transpose, const size_t m, const size_t n, const size_t k,
                const float2 alpha, const std::vector<float2>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<float2>& b_buffer, const size_t b_offset,
                const size_t b_ld, const float2 beta, std::vector<float2>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<float>{beta.real(), beta.imag()};
  cblas_cgemm(layout, a_transpose, b_transpose,
              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), alpha_array.data(),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const float*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
              beta_array.data(),
              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
                const CBLAS_TRANSPOSE b_transpose, const size_t m, const size_t n, const size_t k,
                const double2 alpha, const std::vector<double2>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<double2>& b_buffer, const size_t b_offset,
                const size_t b_ld, const double2 beta, std::vector<double2>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<double>{beta.real(), beta.imag()};
  cblas_zgemm(layout, a_transpose, b_transpose,
              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), alpha_array.data(),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const double*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
              beta_array.data(),
              reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
                const CBLAS_TRANSPOSE b_transpose, const size_t m, const size_t n, const size_t k,
                const half alpha, const std::vector<half>& a_buffer, const size_t a_offset,
                const size_t a_ld, const std::vector<half>& b_buffer, const size_t b_offset,
                const size_t b_ld, const half beta, std::vector<half>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto b_buffer_bis = HalfToFloatBuffer(b_buffer);
  auto c_buffer_bis = HalfToFloatBuffer(c_buffer);
  cblasXgemm(layout, a_transpose, b_transpose, m, n, k, HalfToFloat(alpha),
             a_buffer_bis, a_offset, a_ld,
             b_buffer_bis, b_offset, b_ld, HalfToFloat(beta),
             c_buffer_bis, c_offset, c_ld);
  FloatToHalfBuffer(c_buffer, c_buffer_bis);
}
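// Usage sketch for the half-precision GEMM emulation above (illustrative sizes and values only):
//   const size_t m = 64, n = 64, k = 64;
//   std::vector<half> a(m * k), b(k * n), c(m * n);  // filled elsewhere
//   cblasXgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, FloatToHalf(1.0f),
//              a, 0, k, b, 0, n, FloatToHalf(0.0f), c, 0, n);
// The result in c matches cblas_sgemm up to fp16 rounding of the inputs and the output.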
// Forwards the Netlib BLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM
void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const size_t m, const size_t n, const float alpha,
                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
                const std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld,
                const float beta, std::vector<float>& c_buffer, const size_t c_offset,
                const size_t c_ld) {
  cblas_ssymm(layout, side, triangle, static_cast<int>(m), static_cast<int>(n), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &b_buffer[b_offset], static_cast<int>(b_ld), beta,
              &c_buffer[c_offset], static_cast<int>(c_ld));
}
void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const size_t m, const size_t n, const double alpha,
                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
                const std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld,
                const double beta, std::vector<double>& c_buffer, const size_t c_offset,
                const size_t c_ld) {
  cblas_dsymm(layout, side, triangle, static_cast<int>(m), static_cast<int>(n), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &b_buffer[b_offset], static_cast<int>(b_ld), beta,
              &c_buffer[c_offset], static_cast<int>(c_ld));
}
void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const size_t m, const size_t n, const float2 alpha,
                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                const float2 beta, std::vector<float2>& c_buffer, const size_t c_offset,
                const size_t c_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<float>{beta.real(), beta.imag()};
  cblas_csymm(layout, side, triangle, static_cast<int>(m), static_cast<int>(n),
              alpha_array.data(),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const float*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
              beta_array.data(),
              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const size_t m, const size_t n, const double2 alpha,
                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                const double2 beta, std::vector<double2>& c_buffer, const size_t c_offset,
                const size_t c_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<double>{beta.real(), beta.imag()};
  cblas_zsymm(layout, side, triangle, static_cast<int>(m), static_cast<int>(n),
              alpha_array.data(),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const double*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
              beta_array.data(),
              reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const size_t m, const size_t n, const half alpha,
                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
                const std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld,
                const half beta, std::vector<half>& c_buffer, const size_t c_offset,
                const size_t c_ld) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto b_buffer_bis = HalfToFloatBuffer(b_buffer);
  auto c_buffer_bis = HalfToFloatBuffer(c_buffer);
  cblasXsymm(layout, side, triangle, m, n, HalfToFloat(alpha),
             a_buffer_bis, a_offset, a_ld,
             b_buffer_bis, b_offset, b_ld, HalfToFloat(beta),
             c_buffer_bis, c_offset, c_ld);
  FloatToHalfBuffer(c_buffer, c_buffer_bis);
}

// Forwards the Netlib BLAS calls for CHEMM/ZHEMM
void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const size_t m, const size_t n, const float2 alpha,
                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                const float2 beta, std::vector<float2>& c_buffer, const size_t c_offset,
                const size_t c_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<float>{beta.real(), beta.imag()};
  cblas_chemm(layout, side, triangle, static_cast<int>(m), static_cast<int>(n),
              alpha_array.data(),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const float*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
              beta_array.data(),
              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const size_t m, const size_t n, const double2 alpha,
                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                const double2 beta, std::vector<double2>& c_buffer, const size_t c_offset,
                const size_t c_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<double>{beta.real(), beta.imag()};
  cblas_zhemm(layout, side, triangle, static_cast<int>(m), static_cast<int>(n),
              alpha_array.data(),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<const double*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
              beta_array.data(),
              reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}

// Forwards the Netlib BLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK
void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const size_t n, const size_t k,
                const float alpha, const std::vector<float>& a_buffer, const size_t a_offset,
                const size_t a_ld, const float beta, std::vector<float>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  cblas_ssyrk(layout, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld), beta,
              &c_buffer[c_offset], static_cast<int>(c_ld));
}
void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const size_t n, const size_t k,
                const double alpha, const std::vector<double>& a_buffer, const size_t a_offset,
                const size_t a_ld, const double beta, std::vector<double>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  cblas_dsyrk(layout, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld), beta,
              &c_buffer[c_offset], static_cast<int>(c_ld));
}
void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const size_t n, const size_t k,
                const float2 alpha, const std::vector<float2>& a_buffer, const size_t a_offset,
                const size_t a_ld, const float2 beta, std::vector<float2>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<float>{beta.real(), beta.imag()};
  cblas_csyrk(layout, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k),
              alpha_array.data(),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              beta_array.data(),
              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const size_t n, const size_t k,
                const double2 alpha, const std::vector<double2>& a_buffer, const size_t a_offset,
                const size_t a_ld, const double2 beta, std::vector<double2>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<double>{beta.real(), beta.imag()};
  cblas_zsyrk(layout, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k),
              alpha_array.data(),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              beta_array.data(),
              reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const size_t n, const size_t k,
                const half alpha, const std::vector<half>& a_buffer, const size_t a_offset,
                const size_t a_ld, const half beta, std::vector<half>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto c_buffer_bis = HalfToFloatBuffer(c_buffer);
  cblasXsyrk(layout, triangle, a_transpose, n, k, HalfToFloat(alpha),
             a_buffer_bis, a_offset, a_ld, HalfToFloat(beta),
             c_buffer_bis, c_offset, c_ld);
  FloatToHalfBuffer(c_buffer, c_buffer_bis);
}
// Forwards the Netlib BLAS calls for CHERK/ZHERK
void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const size_t n, const size_t k,
                const float alpha, const std::vector<float2>& a_buffer, const size_t a_offset,
                const size_t a_ld, const float beta, std::vector<float2>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  cblas_cherk(layout, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), alpha,
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld), beta,
              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const size_t n, const size_t k,
                const double alpha, const std::vector<double2>& a_buffer, const size_t a_offset,
                const size_t a_ld, const double beta, std::vector<double2>& c_buffer,
                const size_t c_offset, const size_t c_ld) {
  cblas_zherk(layout, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), alpha,
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld), beta,
              reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
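// Note that xHERK (and the beta of xHER2K below) take real-valued scalars even though the
// matrices are complex: C := alpha * A * A^H + beta * C only remains Hermitian when alpha and
// beta are real, so CBLAS defines them as plain float/double rather than complex values.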
// Forwards the Netlib BLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K
void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                 const CBLAS_TRANSPOSE ab_transpose, const size_t n, const size_t k,
                 const float alpha, const std::vector<float>& a_buffer, const size_t a_offset,
                 const size_t a_ld, const std::vector<float>& b_buffer, const size_t b_offset,
                 const size_t b_ld, const float beta, std::vector<float>& c_buffer,
                 const size_t c_offset, const size_t c_ld) {
  cblas_ssyr2k(layout, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k), alpha,
               &a_buffer[a_offset], static_cast<int>(a_ld),
               &b_buffer[b_offset], static_cast<int>(b_ld), beta,
               &c_buffer[c_offset], static_cast<int>(c_ld));
}
void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                 const CBLAS_TRANSPOSE ab_transpose, const size_t n, const size_t k,
                 const double alpha, const std::vector<double>& a_buffer, const size_t a_offset,
                 const size_t a_ld, const std::vector<double>& b_buffer, const size_t b_offset,
                 const size_t b_ld, const double beta, std::vector<double>& c_buffer,
                 const size_t c_offset, const size_t c_ld) {
  cblas_dsyr2k(layout, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k), alpha,
               &a_buffer[a_offset], static_cast<int>(a_ld),
               &b_buffer[b_offset], static_cast<int>(b_ld), beta,
               &c_buffer[c_offset], static_cast<int>(c_ld));
}
void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                 const CBLAS_TRANSPOSE ab_transpose, const size_t n, const size_t k,
                 const float2 alpha, const std::vector<float2>& a_buffer, const size_t a_offset,
                 const size_t a_ld, const std::vector<float2>& b_buffer, const size_t b_offset,
                 const size_t b_ld, const float2 beta, std::vector<float2>& c_buffer,
                 const size_t c_offset, const size_t c_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<float>{beta.real(), beta.imag()};
  cblas_csyr2k(layout, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k),
               alpha_array.data(),
               reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
               reinterpret_cast<const float*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
               beta_array.data(),
               reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                 const CBLAS_TRANSPOSE ab_transpose, const size_t n, const size_t k,
                 const double2 alpha, const std::vector<double2>& a_buffer, const size_t a_offset,
                 const size_t a_ld, const std::vector<double2>& b_buffer, const size_t b_offset,
                 const size_t b_ld, const double2 beta, std::vector<double2>& c_buffer,
                 const size_t c_offset, const size_t c_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  const auto beta_array = std::vector<double>{beta.real(), beta.imag()};
  cblas_zsyr2k(layout, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k),
               alpha_array.data(),
               reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
               reinterpret_cast<const double*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
               beta_array.data(),
               reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                 const CBLAS_TRANSPOSE ab_transpose, const size_t n, const size_t k,
                 const half alpha, const std::vector<half>& a_buffer, const size_t a_offset,
                 const size_t a_ld, const std::vector<half>& b_buffer, const size_t b_offset,
                 const size_t b_ld, const half beta, std::vector<half>& c_buffer,
                 const size_t c_offset, const size_t c_ld) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto b_buffer_bis = HalfToFloatBuffer(b_buffer);
  auto c_buffer_bis = HalfToFloatBuffer(c_buffer);
  cblasXsyr2k(layout, triangle, ab_transpose, n, k, HalfToFloat(alpha),
              a_buffer_bis, a_offset, a_ld,
              b_buffer_bis, b_offset, b_ld, HalfToFloat(beta),
              c_buffer_bis, c_offset, c_ld);
  FloatToHalfBuffer(c_buffer, c_buffer_bis);
}

// Forwards the Netlib BLAS calls for CHER2K/ZHER2K
void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                 const CBLAS_TRANSPOSE ab_transpose, const size_t n, const size_t k,
                 const float2 alpha, const std::vector<float2>& a_buffer, const size_t a_offset,
                 const size_t a_ld, const std::vector<float2>& b_buffer, const size_t b_offset,
                 const size_t b_ld, const float beta, std::vector<float2>& c_buffer,
                 const size_t c_offset, const size_t c_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  cblas_cher2k(layout, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k),
               alpha_array.data(),
               reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
               reinterpret_cast<const float*>(&b_buffer[b_offset]), static_cast<int>(b_ld), beta,
               reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}
void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
                 const CBLAS_TRANSPOSE ab_transpose, const size_t n, const size_t k,
                 const double2 alpha, const std::vector<double2>& a_buffer, const size_t a_offset,
                 const size_t a_ld, const std::vector<double2>& b_buffer, const size_t b_offset,
                 const size_t b_ld, const double beta, std::vector<double2>& c_buffer,
                 const size_t c_offset, const size_t c_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  cblas_zher2k(layout, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k),
               alpha_array.data(),
               reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
               reinterpret_cast<const double*>(&b_buffer[b_offset]), static_cast<int>(b_ld), beta,
               reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
}

// Forwards the Netlib BLAS calls for STRMM/DTRMM/CTRMM/ZTRMM
void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t m,
                const size_t n, const float alpha, const std::vector<float>& a_buffer,
                const size_t a_offset, const size_t a_ld, std::vector<float>& b_buffer,
                const size_t b_offset, const size_t b_ld) {
  cblas_strmm(layout, side, triangle, a_transpose, diagonal,
              static_cast<int>(m), static_cast<int>(n), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &b_buffer[b_offset], static_cast<int>(b_ld));
}
void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t m,
                const size_t n, const double alpha, const std::vector<double>& a_buffer,
                const size_t a_offset, const size_t a_ld, std::vector<double>& b_buffer,
                const size_t b_offset, const size_t b_ld) {
  cblas_dtrmm(layout, side, triangle, a_transpose, diagonal,
              static_cast<int>(m), static_cast<int>(n), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &b_buffer[b_offset], static_cast<int>(b_ld));
}
void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t m,
                const size_t n, const float2 alpha, const std::vector<float2>& a_buffer,
                const size_t a_offset, const size_t a_ld, std::vector<float2>& b_buffer,
                const size_t b_offset, const size_t b_ld) {
  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
  cblas_ctrmm(layout, side, triangle, a_transpose, diagonal,
              static_cast<int>(m), static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<float*>(&b_buffer[b_offset]), static_cast<int>(b_ld));
}
void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t m,
                const size_t n, const double2 alpha, const std::vector<double2>& a_buffer,
                const size_t a_offset, const size_t a_ld, std::vector<double2>& b_buffer,
                const size_t b_offset, const size_t b_ld) {
  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
  cblas_ztrmm(layout, side, triangle, a_transpose, diagonal,
              static_cast<int>(m), static_cast<int>(n), alpha_array.data(),
              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
              reinterpret_cast<double*>(&b_buffer[b_offset]), static_cast<int>(b_ld));
}
void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t m,
                const size_t n, const half alpha, const std::vector<half>& a_buffer,
                const size_t a_offset, const size_t a_ld, std::vector<half>& b_buffer,
                const size_t b_offset, const size_t b_ld) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
  auto b_buffer_bis = HalfToFloatBuffer(b_buffer);
  cblasXtrmm(layout, side, triangle, a_transpose, diagonal, m, n, HalfToFloat(alpha),
             a_buffer_bis, a_offset, a_ld,
             b_buffer_bis, b_offset, b_ld);
  FloatToHalfBuffer(b_buffer, b_buffer_bis);
}

// Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM
void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t m,
                const size_t n, const float alpha, const std::vector<float>& a_buffer,
                const size_t a_offset, const size_t a_ld, std::vector<float>& b_buffer,
                const size_t b_offset, const size_t b_ld) {
  cblas_strsm(layout, side, triangle, a_transpose, diagonal,
              static_cast<int>(m), static_cast<int>(n), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &b_buffer[b_offset], static_cast<int>(b_ld));
}
void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
                const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t m,
                const size_t n, const double alpha, const std::vector<double>& a_buffer,
                const size_t a_offset, const size_t a_ld, std::vector<double>& b_buffer,
                const size_t b_offset, const size_t b_ld) {
  cblas_dtrsm(layout, side, triangle, a_transpose, diagonal,
              static_cast<int>(m), static_cast<int>(n), alpha,
              &a_buffer[a_offset], static_cast<int>(a_ld),
              &b_buffer[b_offset], static_cast<int>(b_ld));
}
std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_ctrsm(layout, side, triangle, a_transpose, diagonal, static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), static_cast(a_ld), reinterpret_cast(&b_buffer[b_offset]), static_cast(b_ld)); } void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, const size_t m, const size_t n, const double2 alpha, const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; cblas_ztrsm(layout, side, triangle, a_transpose, diagonal, static_cast(m), static_cast(n), alpha_array.data(), reinterpret_cast(&a_buffer[a_offset]), static_cast(a_ld), reinterpret_cast(&b_buffer[b_offset]), static_cast(b_ld)); } // ================================================================================================= } // namespace clblast // CLBLAST_TEST_WRAPPER_CBLAS_H_ #endif CLBlast-1.6.3/test/wrapper_clblas.hpp000066400000000000000000005226051463263031500175120ustar00rootroot00000000000000 // ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements a wrapper around the clBLAS library, such that its routines can be called // in a similar way as the CLBlast routines: using alpha and beta to determine the precision. // // ================================================================================================= #ifndef CLBLAST_TEST_WRAPPER_CLBLAS_H_ #define CLBLAST_TEST_WRAPPER_CLBLAS_H_ #include #include "utilities/utilities.hpp" namespace clblast { // Conversions from CLBlast types clblasOrder convertToCLBLAS(const Layout v) { return (v == Layout::kRowMajor) ? clblasRowMajor : clblasColumnMajor; } clblasTranspose convertToCLBLAS(const Transpose v) { return (v == Transpose::kNo) ? clblasNoTrans : (v == Transpose::kYes) ? clblasTrans : clblasConjTrans; } clblasUplo convertToCLBLAS(const Triangle v) { return (v == Triangle::kUpper) ? clblasUpper : clblasLower; } clblasDiag convertToCLBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? clblasUnit : clblasNonUnit; } clblasSide convertToCLBLAS(const Side v) { return (v == Side::kLeft) ? 
clblasLeft : clblasRight; } // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= // Forwards the clBLAS calls for SROTG/DROTG template clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, Buffer& sb_buffer, const size_t sb_offset, Buffer& sc_buffer, const size_t sc_offset, Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, Buffer& sb_buffer, const size_t sb_offset, Buffer& sc_buffer, const size_t sc_offset, Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrotg(sa_buffer(), sa_offset, sb_buffer(), sb_offset, sc_buffer(), sc_offset, ss_buffer(), ss_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, Buffer& sb_buffer, const size_t sb_offset, Buffer& sc_buffer, const size_t sc_offset, Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrotg(sa_buffer(), sa_offset, sb_buffer(), sb_offset, sc_buffer(), sc_offset, ss_buffer(), ss_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROTMG/DROTMG template clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, Buffer& sd2_buffer, const size_t sd2_offset, Buffer& sx1_buffer, const size_t sx1_offset, const Buffer& sy1_buffer, const size_t sy1_offset, Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, Buffer& sd2_buffer, const size_t sd2_offset, Buffer& sx1_buffer, const size_t sx1_offset, const Buffer& sy1_buffer, const size_t sy1_offset, Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrotmg(sd1_buffer(), sd1_offset, sd2_buffer(), sd2_offset, sx1_buffer(), sx1_offset, sy1_buffer(), sy1_offset, sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, Buffer& sd2_buffer, const size_t sd2_offset, Buffer& sx1_buffer, const size_t sx1_offset, const Buffer& sy1_buffer, const size_t sy1_offset, Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrotmg(sd1_buffer(), sd1_offset, sd2_buffer(), sd2_offset, sx1_buffer(), sx1_offset, sy1_buffer(), sy1_offset, sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROT/DROT clblasStatus clblasXrot(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, const float cos, const float sin, 
cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrot(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), cos, sin, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXrot(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, const double cos, const double sin, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrot(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), cos, sin, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROTM/DROTM template clblasStatus clblasXrotm(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXrotm(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrotm(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXrotm(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrotm(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP template clblasStatus clblasXswap(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXswap(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSswap(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDswap(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> 
clblasStatus clblasXswap(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCswap(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZswap(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto status = clblasXswap(n, x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL clblasStatus clblasXscal(const size_t n, const float alpha, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSscal(n, alpha, x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const double alpha, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDscal(n, alpha, x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const float2 alpha, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCscal(n, cl_float2{{alpha.real(), alpha.imag()}}, x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const double2 alpha, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZscal(n, cl_double2{{alpha.real(), alpha.imag()}}, x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const half alpha, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto status = clblasXscal(n, 
HalfToFloat(alpha), x_buffer_bis, x_offset, x_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY template clblasStatus clblasXcopy(const size_t n, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXcopy(const size_t n, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasScopy(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDcopy(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCcopy(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZcopy(n, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto status = clblasXcopy(n, x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY clblasStatus clblasXaxpy(const size_t n, const float alpha, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSaxpy(n, alpha, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, 
num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const double alpha, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDaxpy(n, alpha, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const float2 alpha, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCaxpy(n, cl_float2{{alpha.real(), alpha.imag()}}, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const double2 alpha, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZaxpy(n, cl_double2{{alpha.real(), alpha.imag()}}, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const half alpha, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto status = clblasXaxpy(n, HalfToFloat(alpha), x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for SDOT/DDOT template clblasStatus clblasXdot(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdot(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasSdot(n, dot_buffer(), dot_offset, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdot(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue 
*queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDdot(n, dot_buffer(), dot_offset, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdot(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto dot_buffer_bis = HalfToFloatBuffer(dot_buffer, queues[0]); auto status = clblasXdot(n, dot_buffer_bis, dot_offset, x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(dot_buffer, dot_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for CDOTU/ZDOTU template clblasStatus clblasXdotu(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdotu(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasCdotu(n, dot_buffer(), dot_offset, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdotu(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasZdotu(n, dot_buffer(), dot_offset, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for CDOTC/ZDOTC template clblasStatus clblasXdotc(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdotc(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, 
cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasCdotc(n, dot_buffer(), dot_offset, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdotc(const size_t n, Buffer& dot_buffer, const size_t dot_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasZdotc(n, dot_buffer(), dot_offset, x_buffer(), x_offset, static_cast(x_inc), y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 template clblasStatus clblasXnrm2(const size_t n, Buffer& nrm2_buffer, const size_t nrm2_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXnrm2(const size_t n, Buffer& nrm2_buffer, const size_t nrm2_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasSnrm2(n, nrm2_buffer(), nrm2_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, Buffer& nrm2_buffer, const size_t nrm2_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasDnrm2(n, nrm2_buffer(), nrm2_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, Buffer& nrm2_buffer, const size_t nrm2_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasScnrm2(n, nrm2_buffer(), nrm2_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, Buffer& nrm2_buffer, const size_t nrm2_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); 
return clblasDznrm2(n, nrm2_buffer(), nrm2_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, Buffer& nrm2_buffer, const size_t nrm2_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto nrm2_buffer_bis = HalfToFloatBuffer(nrm2_buffer, queues[0]); auto status = clblasXnrm2(n, nrm2_buffer_bis, nrm2_offset, x_buffer_bis, x_offset, x_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(nrm2_buffer, nrm2_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for SASUM/DASUM/ScASUM/DzASUM template clblasStatus clblasXasum(const size_t n, Buffer& asum_buffer, const size_t asum_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXasum(const size_t n, Buffer& asum_buffer, const size_t asum_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasSasum(n, asum_buffer(), asum_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, Buffer& asum_buffer, const size_t asum_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDasum(n, asum_buffer(), asum_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, Buffer& asum_buffer, const size_t asum_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasScasum(n, asum_buffer(), asum_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, Buffer& asum_buffer, const size_t asum_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDzasum(n, asum_buffer(), asum_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, Buffer& asum_buffer, const size_t asum_offset, const Buffer& x_buffer, const 
size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto asum_buffer_bis = HalfToFloatBuffer(asum_buffer, queues[0]); auto status = clblasXasum(n, asum_buffer_bis, asum_offset, x_buffer_bis, x_offset, x_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(asum_buffer, asum_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template clblasStatus clblasXamax(const size_t n, Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXamax(const size_t n, Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiSamax(n, imax_buffer(), imax_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiDamax(n, imax_buffer(), imax_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiCamax(n, imax_buffer(), imax_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiZamax(n, imax_buffer(), imax_offset, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto imax_buffer_bis = imax_buffer; auto status = clblasXamax(n, 
imax_buffer_bis, imax_offset, x_buffer_bis, x_offset, x_inc, num_queues, queues, num_wait_events, wait_events, events); return status; } // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= // Forwards the clBLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const float alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgemv(layout, a_transpose, m, n, alpha, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const double alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgemv(layout, a_transpose, m, n, alpha, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const float2 alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgemv(layout, a_transpose, m, n, cl_float2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const double2 alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgemv(layout, a_transpose, m, n, cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const half alpha, const Buffer& a_buffer, const 
size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto status = clblasXgemv(layout, a_transpose, m, n, HalfToFloat(alpha), a_buffer_bis, a_offset, a_ld, x_buffer_bis, x_offset, x_inc, HalfToFloat(beta), y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgbmv(layout, a_transpose, m, n, kl, ku, alpha, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgbmv(layout, a_transpose, m, n, kl, ku, alpha, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float2 alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgbmv(layout, a_transpose, m, n, kl, ku, cl_float2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double2 alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint 
num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgbmv(layout, a_transpose, m, n, kl, ku, cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const half alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto status = clblasXgbmv(layout, a_transpose, m, n, kl, ku, HalfToFloat(alpha), a_buffer_bis, a_offset, a_ld, x_buffer_bis, x_offset, x_inc, HalfToFloat(beta), y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for CHEMV/ZHEMV clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChemv(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhemv(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for CHBMV/ZHBMV clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const float2 alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChbmv(layout, triangle, n, k, cl_float2{{alpha.real(), alpha.imag()}}, 
a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const double2 alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhbmv(layout, triangle, n, k, cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for CHPMV/ZHPMV clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, const Buffer& ap_buffer, const size_t ap_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpmv(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, ap_buffer(), ap_offset, x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, const Buffer& ap_buffer, const size_t ap_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpmv(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, ap_buffer(), ap_offset, x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SSYMV/DSYMV clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsymv(layout, triangle, n, alpha, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, 
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsymv(layout, triangle, n, alpha, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const half alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto status = clblasXsymv(layout, triangle, n, HalfToFloat(alpha), a_buffer_bis, a_offset, a_ld, x_buffer_bis, x_offset, x_inc, HalfToFloat(beta), y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for SSBMV/DSBMV clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const float alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsbmv(layout, triangle, n, k, alpha, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const double alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsbmv(layout, triangle, n, k, alpha, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const half alpha, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto status = clblasXsbmv(layout, triangle, n, k, HalfToFloat(alpha), a_buffer_bis, a_offset, a_ld, x_buffer_bis, x_offset, x_inc, HalfToFloat(beta), y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); 
FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for SSPMV/DSPMV clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, const Buffer& ap_buffer, const size_t ap_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspmv(layout, triangle, n, alpha, ap_buffer(), ap_offset, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, const Buffer& ap_buffer, const size_t ap_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspmv(layout, triangle, n, alpha, ap_buffer(), ap_offset, x_buffer(), x_offset, static_cast(x_inc), beta, y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const half alpha, const Buffer& ap_buffer, const size_t ap_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); auto status = clblasXspmv(layout, triangle, n, HalfToFloat(alpha), ap_buffer_bis, ap_offset, x_buffer_bis, x_offset, x_inc, HalfToFloat(beta), y_buffer_bis, y_offset, y_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV template clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasStrmv(layout, triangle, a_transpose, diagonal, n, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> 
clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasCtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasZtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer(), a_offset, a_ld, x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); auto status = clblasXtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer_bis, a_offset, a_ld, x_buffer_bis, x_offset, x_inc, num_queues, queues, num_wait_events, wait_events, events); FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); return status; } // Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV template clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo 
// Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV
template <typename T>
clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events);
template <>
clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto queue = Queue(queues[0]);
  auto context = queue.GetContext();
  auto scratch_buffer = Buffer<float>(context, n);
  return clblasStbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto queue = Queue(queues[0]);
  auto context = queue.GetContext();
  auto scratch_buffer = Buffer<double>(context, n);
  return clblasDtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto queue = Queue(queues[0]);
  auto context = queue.GetContext();
  auto scratch_buffer = Buffer<float2>(context, n);
  return clblasCtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto queue = Queue(queues[0]);
  auto context = queue.GetContext();
  auto scratch_buffer = Buffer<double2>(context, n);
  return clblasZtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
}
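// Note: for the banded routines (xTBMV here and xTBSV further down), k is the number of sub- or
// super-diagonals of the band, and a_ld is the leading dimension of the banded storage rather
// than of a full n-by-n matrix.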
template <>
clblasStatus clblasXtbmv<half>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
  auto status = clblasXtbmv<float>(layout, triangle, a_transpose, diagonal, n, k,
                                   a_buffer_bis, a_offset, a_ld, x_buffer_bis, x_offset, x_inc,
                                   num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]);
  return status;
}

// Forwards the clBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV
template <typename T>
clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<T>& ap_buffer, const size_t ap_offset,
    Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events);
template <>
clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<float>& ap_buffer, const size_t ap_offset,
    Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto queue = Queue(queues[0]);
  auto context = queue.GetContext();
  auto scratch_buffer = Buffer<float>(context, n);
  return clblasStpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer(), ap_offset,
                     x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<double>& ap_buffer, const size_t ap_offset,
    Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto queue = Queue(queues[0]);
  auto context = queue.GetContext();
  auto scratch_buffer = Buffer<double>(context, n);
  return clblasDtpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer(), ap_offset,
                     x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<float2>& ap_buffer, const size_t ap_offset,
    Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto queue = Queue(queues[0]);
  auto context = queue.GetContext();
  auto scratch_buffer = Buffer<float2>(context, n);
  return clblasCtpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer(), ap_offset,
                     x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<double2>& ap_buffer, const size_t ap_offset,
    Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto queue = Queue(queues[0]);
  auto context = queue.GetContext();
  auto scratch_buffer = Buffer<double2>(context, n);
  return clblasZtpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer(), ap_offset,
                     x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
}
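// Note: the packed routines (xTPMV here, and xSPMV/xSPR/xHPR/xTPSV elsewhere in this file) take
// ap_buffer in BLAS packed storage: the n*(n+1)/2 elements of one triangle stored contiguously.
// That is why these signatures carry no leading-dimension argument for the matrix.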
template <>
clblasStatus clblasXtpmv<half>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<half>& ap_buffer, const size_t ap_offset,
    Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]);
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
  auto status = clblasXtpmv<float>(layout, triangle, a_transpose, diagonal, n,
                                   ap_buffer_bis, ap_offset, x_buffer_bis, x_offset, x_inc,
                                   num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]);
  return status;
}

// Forwards the clBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events);
template <>
clblasStatus clblasXtrsv<float>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasStrsv(layout, triangle, a_transpose, diagonal, n, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtrsv<double>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDtrsv(layout, triangle, a_transpose, diagonal, n, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtrsv<float2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCtrsv(layout, triangle, a_transpose, diagonal, n, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtrsv<double2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZtrsv(layout, triangle, a_transpose, diagonal, n, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
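// Note: the triangular-solve forwarders (xTRSV above, xTBSV/xTPSV below) differ from the
// multiply variants in two ways: clBLAS needs no scratch buffer for them, and no half-precision
// overloads are provided, so they forward directly without any conversion.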
// Forwards the clBLAS calls for STBSV/DTBSV/CTBSV/ZTBSV
template <typename T>
clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events);
template <>
clblasStatus clblasXtbsv<float>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasStbsv(layout, triangle, a_transpose, diagonal, n, k, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtbsv<double>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDtbsv(layout, triangle, a_transpose, diagonal, n, k, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtbsv<float2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCtbsv(layout, triangle, a_transpose, diagonal, n, k, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtbsv<double2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k,
    const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZtbsv(layout, triangle, a_transpose, diagonal, n, k, a_buffer(), a_offset, a_ld,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}

// Forwards the clBLAS calls for STPSV/DTPSV/CTPSV/ZTPSV
template <typename T>
clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<T>& ap_buffer, const size_t ap_offset,
    Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events);
template <>
clblasStatus clblasXtpsv<float>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<float>& ap_buffer, const size_t ap_offset,
    Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasStpsv(layout, triangle, a_transpose, diagonal, n, ap_buffer(), ap_offset,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtpsv<double>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<double>& ap_buffer, const size_t ap_offset,
    Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDtpsv(layout, triangle, a_transpose, diagonal, n, ap_buffer(), ap_offset,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtpsv<float2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<float2>& ap_buffer, const size_t ap_offset,
    Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCtpsv(layout, triangle, a_transpose, diagonal, n, ap_buffer(), ap_offset,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXtpsv<double2>(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n,
    const Buffer<double2>& ap_buffer, const size_t ap_offset,
    Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZtpsv(layout, triangle, a_transpose, diagonal, n, ap_buffer(), ap_offset,
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     num_queues, queues, num_wait_events, wait_events, events);
}

// Forwards the clBLAS calls for SGER/DGER
clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n,
    const float alpha, const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSger(layout, m, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                    num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n,
    const double alpha, const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDger(layout, m, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                    num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n,
    const half alpha, const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto status = clblasXger(layout, m, n, HalfToFloat(alpha), x_buffer_bis, x_offset, x_inc,
                           y_buffer_bis, y_offset, y_inc, a_buffer_bis, a_offset, a_ld,
                           num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]);
  return status;
}

// Forwards the clBLAS calls for CGERU/ZGERU
clblasStatus clblasXgeru(const clblasOrder layout, const size_t m, const size_t n,
    const float2 alpha, const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCgeru(layout, m, n, cl_float2{{alpha.real(), alpha.imag()}},
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXgeru(const clblasOrder layout, const size_t m, const size_t n,
    const double2 alpha, const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZgeru(layout, m, n, cl_double2{{alpha.real(), alpha.imag()}},
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}

// Forwards the clBLAS calls for CGERC/ZGERC
clblasStatus clblasXgerc(const clblasOrder layout, const size_t m, const size_t n,
    const float2 alpha, const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCgerc(layout, m, n, cl_float2{{alpha.real(), alpha.imag()}},
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXgerc(const clblasOrder layout, const size_t m, const size_t n,
    const double2 alpha, const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZgerc(layout, m, n, cl_double2{{alpha.real(), alpha.imag()}},
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
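// Note: CLBlast's float2/double2 scalars are std::complex-style values, while clBLAS expects the
// OpenCL cl_float2/cl_double2 vector types; the cl_float2{{alpha.real(), alpha.imag()}}
// conversions used above (and at every complex call site below) perform that translation.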
// Forwards the clBLAS calls for CHER/ZHER
clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const float alpha, const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCher(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    a_buffer(), a_offset, a_ld,
                    num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const double alpha, const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZher(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    a_buffer(), a_offset, a_ld,
                    num_queues, queues, num_wait_events, wait_events, events);
}

// Forwards the clBLAS calls for CHPR/ZHPR
clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const float alpha, const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<float2>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasChpr(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    ap_buffer(), ap_offset,
                    num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const double alpha, const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<double2>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZhpr(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    ap_buffer(), ap_offset,
                    num_queues, queues, num_wait_events, wait_events, events);
}

// Forwards the clBLAS calls for CHER2/ZHER2
clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const float2 alpha, const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCher2(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}},
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const double2 alpha, const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZher2(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}},
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
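// Note: per the standard BLAS conventions followed here, the Hermitian rank-1 updates xHER/xHPR
// take a real-valued alpha even though the matrix is complex, whereas the rank-2 updates
// xHER2/xHPR2 take a complex alpha (and xHERK in the level-3 section takes real alpha and beta).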
// Forwards the clBLAS calls for CHPR2/ZHPR2
clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const float2 alpha, const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<float2>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasChpr2(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}},
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), ap_buffer(), ap_offset,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const double2 alpha, const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<double2>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZhpr2(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}},
                     x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), ap_buffer(), ap_offset,
                     num_queues, queues, num_wait_events, wait_events, events);
}

// Forwards the clBLAS calls for SSYR/DSYR
clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const float alpha, const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSsyr(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    a_buffer(), a_offset, a_ld,
                    num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const double alpha, const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDsyr(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    a_buffer(), a_offset, a_ld,
                    num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const half alpha, const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto status = clblasXsyr(layout, triangle, n, HalfToFloat(alpha),
                           x_buffer_bis, x_offset, x_inc, a_buffer_bis, a_offset, a_ld,
                           num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]);
  return status;
}
// Forwards the clBLAS calls for SSPR/DSPR
clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const float alpha, const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<float>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSspr(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    ap_buffer(), ap_offset,
                    num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const double alpha, const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<double>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDspr(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                    ap_buffer(), ap_offset,
                    num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const half alpha, const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
    Buffer<half>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]);
  auto status = clblasXspr(layout, triangle, n, HalfToFloat(alpha),
                           x_buffer_bis, x_offset, x_inc, ap_buffer_bis, ap_offset,
                           num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]);
  return status;
}

// Forwards the clBLAS calls for SSYR2/DSYR2
clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const float alpha, const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSsyr2(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const double alpha, const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDsyr2(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const half alpha, const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto status = clblasXsyr2(layout, triangle, n, HalfToFloat(alpha),
                            x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc,
                            a_buffer_bis, a_offset, a_ld,
                            num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]);
  return status;
}

// Forwards the clBLAS calls for SSPR2/DSPR2
clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const float alpha, const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<float>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSspr2(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), ap_buffer(), ap_offset,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const double alpha, const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<double>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDspr2(layout, triangle, n, alpha, x_buffer(), x_offset, static_cast<int>(x_inc),
                     y_buffer(), y_offset, static_cast<int>(y_inc), ap_buffer(), ap_offset,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n,
    const half alpha, const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
    const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
    Buffer<half>& ap_buffer, const size_t ap_offset,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]);
  auto status = clblasXspr2(layout, triangle, n, HalfToFloat(alpha),
                            x_buffer_bis, x_offset, x_inc, y_buffer_bis, y_offset, y_inc,
                            ap_buffer_bis, ap_offset,
                            num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]);
  return status;
}

// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
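// Note: the level-3 forwarders below repeat the three patterns established for level-2: the real
// precisions forward directly, the complex precisions convert their scalars to
// cl_float2/cl_double2, and half precision is emulated by round-tripping through fp32 buffers.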
// Forwards the clBLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM
clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose,
    const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k,
    const float alpha, const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld, const float beta,
    Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSgemm(layout, a_transpose, b_transpose, m, n, k, alpha, a_buffer(), a_offset, a_ld,
                     b_buffer(), b_offset, b_ld, beta, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose,
    const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k,
    const double alpha, const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld, const double beta,
    Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDgemm(layout, a_transpose, b_transpose, m, n, k, alpha, a_buffer(), a_offset, a_ld,
                     b_buffer(), b_offset, b_ld, beta, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose,
    const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k,
    const float2 alpha, const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta,
    Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCgemm(layout, a_transpose, b_transpose, m, n, k,
                     cl_float2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                     b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}},
                     c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose,
    const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k,
    const double2 alpha, const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta,
    Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZgemm(layout, a_transpose, b_transpose, m, n, k,
                     cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                     b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}},
                     c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose,
    const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k,
    const half alpha, const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld, const half beta,
    Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
  auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]);
  auto status = clblasXgemm(layout, a_transpose, b_transpose, m, n, k, HalfToFloat(alpha),
                            a_buffer_bis, a_offset, a_ld, b_buffer_bis, b_offset, b_ld,
                            HalfToFloat(beta), c_buffer_bis, c_offset, c_ld,
                            num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]);
  return status;
}
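// A minimal usage sketch of the float GEMM forwarder above (illustrative only; the function and
// variable names below are hypothetical, and clblasSetup() is assumed to have been called):
/*
inline clblasStatus ExampleSgemmCall(const Context& context, cl_command_queue queue) {
  const size_t m = 64, n = 64, k = 64;
  auto a_buffer = Buffer<float>(context, m * k);  // inputs assumed filled elsewhere
  auto b_buffer = Buffer<float>(context, k * n);
  auto c_buffer = Buffer<float>(context, m * n);  // receives C = 1.0 * A * B + 0.0 * C
  return clblasXgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, m, n, k,
                     1.0f, a_buffer, 0, m, b_buffer, 0, k,
                     0.0f, c_buffer, 0, m, 1, &queue, 0, nullptr, nullptr);
}
*/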
// Forwards the clBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM
clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const size_t m, const size_t n,
    const float alpha, const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld, const float beta,
    Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSsymm(layout, side, triangle, m, n, alpha, a_buffer(), a_offset, a_ld,
                     b_buffer(), b_offset, b_ld, beta, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const size_t m, const size_t n,
    const double alpha, const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld, const double beta,
    Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDsymm(layout, side, triangle, m, n, alpha, a_buffer(), a_offset, a_ld,
                     b_buffer(), b_offset, b_ld, beta, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const size_t m, const size_t n,
    const float2 alpha, const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta,
    Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCsymm(layout, side, triangle, m, n, cl_float2{{alpha.real(), alpha.imag()}},
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     cl_float2{{beta.real(), beta.imag()}}, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const size_t m, const size_t n,
    const double2 alpha, const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta,
    Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZsymm(layout, side, triangle, m, n, cl_double2{{alpha.real(), alpha.imag()}},
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     cl_double2{{beta.real(), beta.imag()}}, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const size_t m, const size_t n,
    const half alpha, const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld, const half beta,
    Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
  auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]);
  auto status = clblasXsymm(layout, side, triangle, m, n, HalfToFloat(alpha),
                            a_buffer_bis, a_offset, a_ld, b_buffer_bis, b_offset, b_ld,
                            HalfToFloat(beta), c_buffer_bis, c_offset, c_ld,
                            num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]);
  return status;
}

// Forwards the clBLAS calls for CHEMM/ZHEMM
clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const size_t m, const size_t n,
    const float2 alpha, const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta,
    Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasChemm(layout, side, triangle, m, n, cl_float2{{alpha.real(), alpha.imag()}},
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     cl_float2{{beta.real(), beta.imag()}}, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const size_t m, const size_t n,
    const double2 alpha, const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta,
    Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZhemm(layout, side, triangle, m, n, cl_double2{{alpha.real(), alpha.imag()}},
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     cl_double2{{beta.real(), beta.imag()}}, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}

// Forwards the clBLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK
clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const size_t n, const size_t k,
    const float alpha, const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    const float beta, Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSsyrk(layout, triangle, a_transpose, n, k, alpha, a_buffer(), a_offset, a_ld,
                     beta, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const size_t n, const size_t k,
    const double alpha, const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    const double beta, Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDsyrk(layout, triangle, a_transpose, n, k, alpha, a_buffer(), a_offset, a_ld,
                     beta, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const size_t n, const size_t k,
    const float2 alpha, const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const float2 beta, Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCsyrk(layout, triangle, a_transpose, n, k,
                     cl_float2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                     cl_float2{{beta.real(), beta.imag()}}, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const size_t n, const size_t k,
    const double2 alpha, const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const double2 beta, Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZsyrk(layout, triangle, a_transpose, n, k,
                     cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                     cl_double2{{beta.real(), beta.imag()}}, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const size_t n, const size_t k,
    const half alpha, const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    const half beta, Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]);
  auto status = clblasXsyrk(layout, triangle, a_transpose, n, k, HalfToFloat(alpha),
                            a_buffer_bis, a_offset, a_ld, HalfToFloat(beta),
                            c_buffer_bis, c_offset, c_ld,
                            num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]);
  return status;
}

// Forwards the clBLAS calls for CHERK/ZHERK
clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const size_t n, const size_t k,
    const float alpha, const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const float beta, Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCherk(layout, triangle, a_transpose, n, k, alpha, a_buffer(), a_offset, a_ld,
                     beta, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose a_transpose, const size_t n, const size_t k,
    const double alpha, const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const double beta, Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZherk(layout, triangle, a_transpose, n, k, alpha, a_buffer(), a_offset, a_ld,
                     beta, c_buffer(), c_offset, c_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
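// Note: the rank-k updates above (xSYRK/xHERK) and the rank-2k updates below (xSYR2K/xHER2K)
// only read and write the triangle of C selected by the 'triangle' argument; the opposite
// triangle is left untouched.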
// Forwards the clBLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K
clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose ab_transpose, const size_t n, const size_t k,
    const float alpha, const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld, const float beta,
    Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasSsyr2k(layout, triangle, ab_transpose, n, k, alpha, a_buffer(), a_offset, a_ld,
                      b_buffer(), b_offset, b_ld, beta, c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose ab_transpose, const size_t n, const size_t k,
    const double alpha, const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld, const double beta,
    Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDsyr2k(layout, triangle, ab_transpose, n, k, alpha, a_buffer(), a_offset, a_ld,
                      b_buffer(), b_offset, b_ld, beta, c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose ab_transpose, const size_t n, const size_t k,
    const float2 alpha, const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta,
    Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCsyr2k(layout, triangle, ab_transpose, n, k,
                      cl_float2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                      b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}},
                      c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose ab_transpose, const size_t n, const size_t k,
    const double2 alpha, const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta,
    Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZsyr2k(layout, triangle, ab_transpose, n, k,
                      cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                      b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}},
                      c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose ab_transpose, const size_t n, const size_t k,
    const half alpha, const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld, const half beta,
    Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
  auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]);
  auto status = clblasXsyr2k(layout, triangle, ab_transpose, n, k, HalfToFloat(alpha),
                             a_buffer_bis, a_offset, a_ld, b_buffer_bis, b_offset, b_ld,
                             HalfToFloat(beta), c_buffer_bis, c_offset, c_ld,
                             num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]);
  return status;
}
// Forwards the clBLAS calls for CHER2K/ZHER2K
clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose ab_transpose, const size_t n, const size_t k,
    const float2 alpha, const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float beta,
    Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCher2k(layout, triangle, ab_transpose, n, k,
                      cl_float2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                      b_buffer(), b_offset, b_ld, beta, c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle,
    const clblasTranspose ab_transpose, const size_t n, const size_t k,
    const double2 alpha, const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double beta,
    Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZher2k(layout, triangle, ab_transpose, n, k,
                      cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                      b_buffer(), b_offset, b_ld, beta, c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
}

// Forwards the clBLAS calls for STRMM/DTRMM/CTRMM/ZTRMM
clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
    const size_t m, const size_t n,
    const float alpha, const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasStrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
    const size_t m, const size_t n,
    const double alpha, const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasDtrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
    const size_t m, const size_t n,
    const float2 alpha, const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasCtrmm(layout, side, triangle, a_transpose, diagonal, m, n,
                     cl_float2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                     b_buffer(), b_offset, b_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
    const size_t m, const size_t n,
    const double2 alpha, const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  return clblasZtrmm(layout, side, triangle, a_transpose, diagonal, m, n,
                     cl_double2{{alpha.real(), alpha.imag()}}, a_buffer(), a_offset, a_ld,
                     b_buffer(), b_offset, b_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side,
    const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
    const size_t m, const size_t n,
    const half alpha, const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
    Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
    cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events,
    const cl_event *wait_events, cl_event *events) {
  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
  auto status = clblasXtrmm(layout, side, triangle, a_transpose, diagonal, m, n,
                            HalfToFloat(alpha), a_buffer_bis, a_offset, a_ld,
                            b_buffer_bis, b_offset, b_ld,
                            num_queues, queues, num_wait_events, wait_events, events);
  FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]);
  return status;
}
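// Note: xTRMM above and xTRSM below are in-place routines: B serves as both input and output,
// which is why b_buffer is passed by non-const reference while the triangular matrix A remains
// const. Note also that xTRMM gets an emulated half-precision overload but xTRSM, like the
// level-2 solvers, does not.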
// Forwards the clBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM
clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                         const clblasTranspose a_transpose, const clblasDiag diagonal,
                         const size_t m, const size_t n, const float alpha,
                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
                         Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
  return clblasStrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                         const clblasTranspose a_transpose, const clblasDiag diagonal,
                         const size_t m, const size_t n, const double alpha,
                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
                         Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
  return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                         const clblasTranspose a_transpose, const clblasDiag diagonal,
                         const size_t m, const size_t n, const float2 alpha,
                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                         Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
  return clblasCtrsm(layout, side, triangle, a_transpose, diagonal, m, n,
                     cl_float2{{alpha.real(), alpha.imag()}},
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                         const clblasTranspose a_transpose, const clblasDiag diagonal,
                         const size_t m, const size_t n, const double2 alpha,
                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                         Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
  return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, m, n,
                     cl_double2{{alpha.real(), alpha.imag()}},
                     a_buffer(), a_offset, a_ld, b_buffer(), b_offset, b_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
}

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_WRAPPER_CLBLAS_H_
#endif
CLBlast-1.6.3/test/wrapper_cublas.hpp000066400000000000000000004370201463263031500175170ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a wrapper around the cuBLAS library, such that its routines can be called
// in a similar way as the CLBlast routines: using alpha and beta to determine the precision.
//
// =================================================================================================

#ifndef CLBLAST_TEST_WRAPPER_CUBLAS_H_
#define CLBLAST_TEST_WRAPPER_CUBLAS_H_

#include <cuda_runtime.h>
#include <cublas_v2.h>

#include "utilities/utilities.hpp"

namespace clblast {

// Conversions from CLBlast types
cublasOperation_t convertToCUBLAS(const Transpose v) { return (v == Transpose::kNo) ? CUBLAS_OP_N : (v == Transpose::kYes) ? CUBLAS_OP_T : CUBLAS_OP_C; }
cublasFillMode_t convertToCUBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; }
cublasDiagType_t convertToCUBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; }
cublasSideMode_t convertToCUBLAS(const Side v) { return (v == Side::kLeft) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; }
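// The wrappers below overload on the element type, so a caller picks the precision purely through
// its alpha/beta and buffer arguments (float -> S-routines, double -> D, float2 -> C, double2 -> Z).
// A compile-time sketch of the complex half of that mapping (the trait name 'CudaScalarOf' is
// illustrative only and not part of CLBlast or cuBLAS):
template <typename T> struct CudaScalarOf { using type = T; };                // float/double pass through
template <> struct CudaScalarOf<float2> { using type = cuComplex; };          // forwarded to C-routines
template <> struct CudaScalarOf<double2> { using type = cuDoubleComplex; };   // forwarded to Z-routines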
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// Forwards the cuBLAS calls for SROTG/DROTG
template <typename T>
cublasStatus_t cublasXrotg(cublasHandle_t handle, T* sa_buffer, const size_t sa_offset,
                           T* sb_buffer, const size_t sb_offset, T* sc_buffer, const size_t sc_offset,
                           T* ss_buffer, const size_t ss_offset);
template <>
cublasStatus_t cublasXrotg<float>(cublasHandle_t handle, float* sa_buffer, const size_t sa_offset,
                                  float* sb_buffer, const size_t sb_offset,
                                  float* sc_buffer, const size_t sc_offset,
                                  float* ss_buffer, const size_t ss_offset) {
  auto status = cublasSrotg(handle, &sa_buffer[sa_offset], &sb_buffer[sb_offset],
                            &sc_buffer[sc_offset], &ss_buffer[ss_offset]);
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXrotg<double>(cublasHandle_t handle, double* sa_buffer, const size_t sa_offset,
                                   double* sb_buffer, const size_t sb_offset,
                                   double* sc_buffer, const size_t sc_offset,
                                   double* ss_buffer, const size_t ss_offset) {
  auto status = cublasDrotg(handle, &sa_buffer[sa_offset], &sb_buffer[sb_offset],
                            &sc_buffer[sc_offset], &ss_buffer[ss_offset]);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for SROTMG/DROTMG
template <typename T>
cublasStatus_t cublasXrotmg(cublasHandle_t handle, T* sd1_buffer, const size_t sd1_offset,
                            T* sd2_buffer, const size_t sd2_offset, T* sx1_buffer, const size_t sx1_offset,
                            const T* sy1_buffer, const size_t sy1_offset,
                            T* sparam_buffer, const size_t sparam_offset);
template <>
cublasStatus_t cublasXrotmg<float>(cublasHandle_t handle, float* sd1_buffer, const size_t sd1_offset,
                                   float* sd2_buffer, const size_t sd2_offset,
                                   float* sx1_buffer, const size_t sx1_offset,
                                   const float* sy1_buffer, const size_t sy1_offset,
                                   float* sparam_buffer, const size_t sparam_offset) {
  auto status = cublasSrotmg(handle, &sd1_buffer[sd1_offset], &sd2_buffer[sd2_offset],
                             &sx1_buffer[sx1_offset], &sy1_buffer[sy1_offset],
                             &sparam_buffer[sparam_offset]);
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXrotmg<double>(cublasHandle_t handle, double* sd1_buffer, const size_t sd1_offset,
                                    double* sd2_buffer, const size_t sd2_offset,
                                    double* sx1_buffer, const size_t sx1_offset,
                                    const double* sy1_buffer, const size_t sy1_offset,
                                    double* sparam_buffer, const size_t sparam_offset) {
  auto status = cublasDrotmg(handle, &sd1_buffer[sd1_offset], &sd2_buffer[sd2_offset],
                             &sx1_buffer[sx1_offset], &sy1_buffer[sy1_offset],
                             &sparam_buffer[sparam_offset]);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for SROT/DROT
cublasStatus_t cublasXrot(cublasHandle_t handle, const size_t n,
                          float* x_buffer, const size_t x_offset, const size_t x_inc,
                          float* y_buffer, const size_t y_offset, const size_t y_inc,
                          const float cos, const float sin) {
  auto status = cublasSrot(handle, static_cast<int>(n),
                           &x_buffer[x_offset], static_cast<int>(x_inc),
                           &y_buffer[y_offset], static_cast<int>(y_inc), &cos, &sin);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXrot(cublasHandle_t handle, const size_t n,
                          double* x_buffer, const size_t x_offset, const size_t x_inc,
                          double* y_buffer, const size_t y_offset, const size_t y_inc,
                          const double cos, const double sin) {
  auto status = cublasDrot(handle, static_cast<int>(n),
                           &x_buffer[x_offset], static_cast<int>(x_inc),
                           &y_buffer[y_offset], static_cast<int>(y_inc), &cos, &sin);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for SROTM/DROTM
template <typename T>
cublasStatus_t cublasXrotm(cublasHandle_t handle, const size_t n,
                           T* x_buffer, const size_t x_offset, const size_t x_inc,
                           T* y_buffer, const size_t y_offset, const size_t y_inc,
                           T* sparam_buffer, const size_t sparam_offset);
template <>
cublasStatus_t cublasXrotm<float>(cublasHandle_t handle, const size_t n,
                                  float* x_buffer, const size_t x_offset, const size_t x_inc,
                                  float* y_buffer, const size_t y_offset, const size_t y_inc,
                                  float* sparam_buffer, const size_t sparam_offset) {
  auto status = cublasSrotm(handle, static_cast<int>(n),
                            &x_buffer[x_offset], static_cast<int>(x_inc),
                            &y_buffer[y_offset], static_cast<int>(y_inc),
                            &sparam_buffer[sparam_offset]);
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXrotm<double>(cublasHandle_t handle, const size_t n,
                                   double* x_buffer, const size_t x_offset, const size_t x_inc,
                                   double* y_buffer, const size_t y_offset, const size_t y_inc,
                                   double* sparam_buffer, const size_t sparam_offset) {
  auto status = cublasDrotm(handle, static_cast<int>(n),
                            &x_buffer[x_offset], static_cast<int>(x_inc),
                            &y_buffer[y_offset], static_cast<int>(y_inc),
                            &sparam_buffer[sparam_offset]);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP
template <typename T>
cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n,
                           T* x_buffer, const size_t x_offset, const size_t x_inc,
                           T* y_buffer, const size_t y_offset, const size_t y_inc);
template <>
cublasStatus_t cublasXswap<float>(cublasHandle_t handle, const size_t n,
                                  float* x_buffer, const size_t x_offset, const size_t x_inc,
                                  float* y_buffer, const size_t y_offset, const size_t y_inc) {
  auto status = cublasSswap(handle, static_cast<int>(n),
                            &x_buffer[x_offset], static_cast<int>(x_inc),
                            &y_buffer[y_offset], static_cast<int>(y_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXswap<double>(cublasHandle_t handle, const size_t n,
                                   double* x_buffer, const size_t x_offset, const size_t x_inc,
                                   double* y_buffer, const size_t y_offset, const size_t y_inc) {
  auto status = cublasDswap(handle, static_cast<int>(n),
                            &x_buffer[x_offset], static_cast<int>(x_inc),
                            &y_buffer[y_offset], static_cast<int>(y_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXswap<float2>(cublasHandle_t handle, const size_t n,
                                   float2* x_buffer, const size_t x_offset, const size_t x_inc,
                                   float2* y_buffer, const size_t y_offset, const size_t y_inc) {
  auto status = cublasCswap(handle, static_cast<int>(n),
                            reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
                            reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXswap<double2>(cublasHandle_t handle, const size_t n,
                                    double2* x_buffer, const size_t x_offset, const size_t x_inc,
                                    double2* y_buffer, const size_t y_offset, const size_t y_inc) {
  auto status = cublasZswap(handle, static_cast<int>(n),
                            reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
                            reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXswap<half>(cublasHandle_t handle, const size_t n,
                                 half* x_buffer, const size_t x_offset, const size_t x_inc,
                                 half* y_buffer, const size_t y_offset, const size_t y_inc) {
  return CUBLAS_STATUS_NOT_SUPPORTED;
}

// Forwards the cuBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL
cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, const float alpha,
                           float* x_buffer, const size_t x_offset, const size_t x_inc) {
  auto status = cublasSscal(handle, static_cast<int>(n), &alpha, &x_buffer[x_offset],
static_cast(x_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, const double alpha, double* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasDscal(handle, static_cast(n), &alpha, &x_buffer[x_offset], static_cast(x_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, const float2 alpha, float2* x_buffer, const size_t x_offset, const size_t x_inc) { cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); auto status = cublasCscal(handle, static_cast(n), &alpha_cuda, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, const double2 alpha, double2* x_buffer, const size_t x_offset, const size_t x_inc) { cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); auto status = cublasZscal(handle, static_cast(n), &alpha_cuda, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, const half alpha, half* x_buffer, const size_t x_offset, const size_t x_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY template cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, const T* x_buffer, const size_t x_offset, const size_t x_inc, T* y_buffer, const size_t y_offset, const size_t y_inc); template <> cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, const float* x_buffer, const size_t x_offset, const size_t x_inc, float* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasScopy(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, const double* x_buffer, const size_t x_offset, const size_t x_inc, double* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasDcopy(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, const float2* x_buffer, const size_t x_offset, const size_t x_inc, float2* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasCcopy(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, const double2* x_buffer, const size_t x_offset, const size_t x_inc, double2* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasZcopy(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, const half* x_buffer, const size_t x_offset, const size_t x_inc, half* y_buffer, const size_t y_offset, const size_t y_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, 
const float alpha, const float* x_buffer, const size_t x_offset, const size_t x_inc, float* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasSaxpy(handle, static_cast(n), &alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, const double alpha, const double* x_buffer, const size_t x_offset, const size_t x_inc, double* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasDaxpy(handle, static_cast(n), &alpha, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, const float2 alpha, const float2* x_buffer, const size_t x_offset, const size_t x_inc, float2* y_buffer, const size_t y_offset, const size_t y_inc) { cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); auto status = cublasCaxpy(handle, static_cast(n), &alpha_cuda, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, const double2 alpha, const double2* x_buffer, const size_t x_offset, const size_t x_inc, double2* y_buffer, const size_t y_offset, const size_t y_inc) { cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); auto status = cublasZaxpy(handle, static_cast(n), &alpha_cuda, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, const half alpha, const half* x_buffer, const size_t x_offset, const size_t x_inc, half* y_buffer, const size_t y_offset, const size_t y_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for SDOT/DDOT template cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, T* dot_buffer, const size_t dot_offset, const T* x_buffer, const size_t x_offset, const size_t x_inc, const T* y_buffer, const size_t y_offset, const size_t y_inc); template <> cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, float* dot_buffer, const size_t dot_offset, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasSdot(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), &dot_buffer[dot_offset]); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, double* dot_buffer, const size_t dot_offset, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasDdot(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc), &dot_buffer[dot_offset]); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, half* dot_buffer, const size_t dot_offset, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half* y_buffer, const size_t y_offset, const size_t y_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for CDOTU/ZDOTU 
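// (These complex forwarders reinterpret the std::complex-based float2/double2 element types as
// cuComplex/cuDoubleComplex. C++11 guarantees std::complex<T> is layout-compatible with T[2],
// which matches the two-float/two-double layout of the CUDA complex structs; the size half of
// that assumption can be checked at compile time:)
static_assert(sizeof(float2) == sizeof(cuComplex), "float2 must have the cuComplex layout");
static_assert(sizeof(double2) == sizeof(cuDoubleComplex), "double2 must have the cuDoubleComplex layout");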
template cublasStatus_t cublasXdotu(cublasHandle_t handle, const size_t n, T* dot_buffer, const size_t dot_offset, const T* x_buffer, const size_t x_offset, const size_t x_inc, const T* y_buffer, const size_t y_offset, const size_t y_inc); template <> cublasStatus_t cublasXdotu(cublasHandle_t handle, const size_t n, float2* dot_buffer, const size_t dot_offset, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasCdotu(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXdotu(cublasHandle_t handle, const size_t n, double2* dot_buffer, const size_t dot_offset, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasZdotu(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); cudaDeviceSynchronize(); return status; } // Forwards the cuBLAS calls for CDOTC/ZDOTC template cublasStatus_t cublasXdotc(cublasHandle_t handle, const size_t n, T* dot_buffer, const size_t dot_offset, const T* x_buffer, const size_t x_offset, const size_t x_inc, const T* y_buffer, const size_t y_offset, const size_t y_inc); template <> cublasStatus_t cublasXdotc(cublasHandle_t handle, const size_t n, float2* dot_buffer, const size_t dot_offset, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasCdotc(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXdotc(cublasHandle_t handle, const size_t n, double2* dot_buffer, const size_t dot_offset, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2* y_buffer, const size_t y_offset, const size_t y_inc) { auto status = cublasZdotc(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), reinterpret_cast(&dot_buffer[dot_offset])); cudaDeviceSynchronize(); return status; } // Forwards the cuBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 template cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, T* nrm2_buffer, const size_t nrm2_offset, const T* x_buffer, const size_t x_offset, const size_t x_inc); template <> cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, float* nrm2_buffer, const size_t nrm2_offset, const float* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasSnrm2(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &nrm2_buffer[nrm2_offset]); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, double* nrm2_buffer, const size_t nrm2_offset, const double* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasDnrm2(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &nrm2_buffer[nrm2_offset]); cudaDeviceSynchronize(); return status; } template <> 
cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, float2* nrm2_buffer, const size_t nrm2_offset, const float2* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasScnrm2(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&nrm2_buffer[nrm2_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, double2* nrm2_buffer, const size_t nrm2_offset, const double2* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasDznrm2(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&nrm2_buffer[nrm2_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, half* nrm2_buffer, const size_t nrm2_offset, const half* x_buffer, const size_t x_offset, const size_t x_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for SASUM/DASUM/ScASUM/DzASUM template cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, T* asum_buffer, const size_t asum_offset, const T* x_buffer, const size_t x_offset, const size_t x_inc); template <> cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, float* asum_buffer, const size_t asum_offset, const float* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasSasum(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &asum_buffer[asum_offset]); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, double* asum_buffer, const size_t asum_offset, const double* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasDasum(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), &asum_buffer[asum_offset]); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, float2* asum_buffer, const size_t asum_offset, const float2* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasScasum(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&asum_buffer[asum_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, double2* asum_buffer, const size_t asum_offset, const double2* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasDzasum(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&asum_buffer[asum_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, half* asum_buffer, const size_t asum_offset, const half* x_buffer, const size_t x_offset, const size_t x_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, unsigned int * imax_buffer, const size_t imax_offset, const T* x_buffer, const size_t x_offset, const size_t x_inc); template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, unsigned int * imax_buffer, const size_t imax_offset, const float* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasIsamax(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), 
reinterpret_cast(&imax_buffer[imax_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, unsigned int * imax_buffer, const size_t imax_offset, const double* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasIdamax(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), reinterpret_cast(&imax_buffer[imax_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, unsigned int * imax_buffer, const size_t imax_offset, const float2* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasIcamax(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&imax_buffer[imax_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, unsigned int * imax_buffer, const size_t imax_offset, const double2* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasIzamax(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&imax_buffer[imax_offset])); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, unsigned int * imax_buffer, const size_t imax_offset, const half* x_buffer, const size_t x_offset, const size_t x_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= // Forwards the cuBLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float beta, float* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasSgemv(handle, a_transpose, static_cast(m), static_cast(n), &alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double beta, double* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasDgemv(handle, a_transpose, static_cast(m), static_cast(n), &alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, float2* y_buffer, const size_t y_offset, const size_t 
y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasCgemv(handle, a_transpose, static_cast(m), static_cast(n), &alpha_cuda, reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, double2* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasZgemv(handle, a_transpose, static_cast(m), static_cast(n), &alpha_cuda, reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half beta, half* y_buffer, const size_t y_offset, const size_t y_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float beta, float* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasSgbmv(handle, a_transpose, static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), &alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double beta, double* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasDgbmv(handle, a_transpose, static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), &alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t 
a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, float2* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasCgbmv(handle, a_transpose, static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), &alpha_cuda, reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, double2* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasZgbmv(handle, a_transpose, static_cast(m), static_cast(n), static_cast(kl), static_cast(ku), &alpha_cuda, reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half beta, half* y_buffer, const size_t y_offset, const size_t y_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for CHEMV/ZHEMV cublasStatus_t cublasXhemv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, float2* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasChemv(handle, triangle, static_cast(n), &alpha_cuda, reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXhemv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, double2* y_buffer, 
const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasZhemv(handle, triangle, static_cast(n), &alpha_cuda, reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } // Forwards the cuBLAS calls for CHBMV/ZHBMV cublasStatus_t cublasXhbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const size_t k, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, float2* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasChbmv(handle, triangle, static_cast(n), static_cast(k), &alpha_cuda, reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXhbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const size_t k, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, double2* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasZhbmv(handle, triangle, static_cast(n), static_cast(k), &alpha_cuda, reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } // Forwards the cuBLAS calls for CHPMV/ZHPMV cublasStatus_t cublasXhpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float2 alpha, const float2* ap_buffer, const size_t ap_offset, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, float2* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasChpmv(handle, triangle, static_cast(n), &alpha_cuda, reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXhpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double2 alpha, const double2* ap_buffer, const size_t ap_offset, const double2* 
x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, double2* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag(); cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag(); auto status = cublasZhpmv(handle, triangle, static_cast(n), &alpha_cuda, reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), &beta_cuda, reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); cudaDeviceSynchronize(); return status; } // Forwards the cuBLAS calls for SSYMV/DSYMV cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float beta, float* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasSsymv(handle, triangle, static_cast(n), &alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double beta, double* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasDsymv(handle, triangle, static_cast(n), &alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half beta, half* y_buffer, const size_t y_offset, const size_t y_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for SSBMV/DSBMV cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const size_t k, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float beta, float* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasSsbmv(handle, triangle, static_cast(n), static_cast(k), &alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const size_t k, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double beta, double* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return 
CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasDsbmv(handle, triangle, static_cast(n), static_cast(k), &alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const size_t k, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half beta, half* y_buffer, const size_t y_offset, const size_t y_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for SSPMV/DSPMV cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float alpha, const float* ap_buffer, const size_t ap_offset, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float beta, float* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasSspmv(handle, triangle, static_cast(n), &alpha, &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double alpha, const double* ap_buffer, const size_t ap_offset, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double beta, double* y_buffer, const size_t y_offset, const size_t y_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasDspmv(handle, triangle, static_cast(n), &alpha, &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc), &beta, &y_buffer[y_offset], static_cast(y_inc)); cudaDeviceSynchronize(); return status; } cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const half alpha, const half* ap_buffer, const size_t ap_offset, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half beta, half* y_buffer, const size_t y_offset, const size_t y_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV template cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const T* a_buffer, const size_t a_offset, const size_t a_ld, T* x_buffer, const size_t x_offset, const size_t x_inc); template <> cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const float* a_buffer, const size_t a_offset, const size_t a_ld, float* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasStrmv(handle, triangle, a_transpose, diagonal, static_cast(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const double* a_buffer, const 
size_t a_offset, const size_t a_ld, double* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasDtrmv(handle, triangle, a_transpose, diagonal, static_cast(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const float2* a_buffer, const size_t a_offset, const size_t a_ld, float2* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasCtrmv(handle, triangle, a_transpose, diagonal, static_cast(n), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const double2* a_buffer, const size_t a_offset, const size_t a_ld, double2* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasZtrmv(handle, triangle, a_transpose, diagonal, static_cast(n), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const half* a_buffer, const size_t a_offset, const size_t a_ld, half* x_buffer, const size_t x_offset, const size_t x_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV template cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const T* a_buffer, const size_t a_offset, const size_t a_ld, T* x_buffer, const size_t x_offset, const size_t x_inc); template <> cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const float* a_buffer, const size_t a_offset, const size_t a_ld, float* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasStbmv(handle, triangle, a_transpose, diagonal, static_cast(n), static_cast(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const double* a_buffer, const size_t a_offset, const size_t a_ld, double* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasDtbmv(handle, triangle, a_transpose, diagonal, static_cast(n), static_cast(k), 
&a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const float2* a_buffer, const size_t a_offset, const size_t a_ld, float2* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasCtbmv(handle, triangle, a_transpose, diagonal, static_cast(n), static_cast(k), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const double2* a_buffer, const size_t a_offset, const size_t a_ld, double2* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasZtbmv(handle, triangle, a_transpose, diagonal, static_cast(n), static_cast(k), reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const half* a_buffer, const size_t a_offset, const size_t a_ld, half* x_buffer, const size_t x_offset, const size_t x_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } // Forwards the cuBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV template cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const T* ap_buffer, const size_t ap_offset, T* x_buffer, const size_t x_offset, const size_t x_inc); template <> cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const float* ap_buffer, const size_t ap_offset, float* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasStpmv(handle, triangle, a_transpose, diagonal, static_cast(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const double* ap_buffer, const size_t ap_offset, double* x_buffer, const size_t x_offset, const size_t x_inc) { if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } auto status = cublasDtpmv(handle, triangle, a_transpose, diagonal, static_cast(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast(x_inc)); cudaDeviceSynchronize(); return status; } template <> cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const float2* 
ap_buffer, const size_t ap_offset, float2* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasCtpmv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), reinterpret_cast<const cuComplex*>(&ap_buffer[ap_offset]), reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const double2* ap_buffer, const size_t ap_offset, double2* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasZtpmv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), reinterpret_cast<const cuDoubleComplex*>(&ap_buffer[ap_offset]), reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const half* ap_buffer, const size_t ap_offset, half* x_buffer, const size_t x_offset, const size_t x_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const T* a_buffer, const size_t a_offset, const size_t a_ld, T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const float* a_buffer, const size_t a_offset, const size_t a_ld, float* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasStrsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const double* a_buffer, const size_t a_offset, const size_t a_ld, double* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDtrsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const float2* a_buffer, const size_t a_offset, const size_t a_ld, float2* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasCtrsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const double2* a_buffer, const size_t a_offset, const size_t a_ld, double2* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasZtrsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for STBSV/DTBSV/CTBSV/ZTBSV
template <typename T>
cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const T* a_buffer, const size_t a_offset, const size_t a_ld, T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const float* a_buffer, const size_t a_offset, const size_t a_ld, float* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasStbsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const double* a_buffer, const size_t a_offset, const size_t a_ld, double* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDtbsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const float2* a_buffer, const size_t a_offset, const size_t a_ld, float2* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasCtbsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k), reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const size_t k, const double2* a_buffer, const size_t a_offset, const size_t a_ld, double2* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasZtbsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), static_cast<int>(k), reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for STPSV/DTPSV/CTPSV/ZTPSV
template <typename T>
cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const T* ap_buffer, const size_t ap_offset, T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const float* ap_buffer, const size_t ap_offset, float* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasStpsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const double* ap_buffer, const size_t ap_offset, double* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDtpsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const float2* ap_buffer, const size_t ap_offset, float2* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasCtpsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), reinterpret_cast<const cuComplex*>(&ap_buffer[ap_offset]), reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}
template <>
cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t n, const double2* ap_buffer, const size_t ap_offset, double2* x_buffer, const size_t x_offset, const size_t x_inc) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasZtpsv(handle, triangle, a_transpose, diagonal, static_cast<int>(n), reinterpret_cast<const cuDoubleComplex*>(&ap_buffer[ap_offset]), reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for SGER/DGER
cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, const size_t m, const size_t n, const float alpha, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float* y_buffer, const size_t y_offset, const size_t y_inc, float* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSger(handle, static_cast<int>(m), static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), &a_buffer[a_offset], a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, const size_t m, const size_t n, const double alpha, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double* y_buffer, const size_t y_offset, const size_t y_inc, double* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDger(handle, static_cast<int>(m), static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), &a_buffer[a_offset], a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, const size_t m, const size_t n, const half alpha, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half* y_buffer, const size_t y_offset, const size_t y_inc, half* a_buffer, const size_t a_offset, const size_t a_ld) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for CGERU/ZGERU
cublasStatus_t cublasXgeru(cublasHandle_t handle, const Layout layout, const size_t m, const size_t n, const float2 alpha, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2* y_buffer, const size_t y_offset, const size_t y_inc, float2* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasCgeru(handle, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXgeru(cublasHandle_t handle, const Layout layout, const size_t m, const size_t n, const double2 alpha, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2* y_buffer, const size_t y_offset, const size_t y_inc, double2* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasZgeru(handle, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for CGERC/ZGERC
cublasStatus_t cublasXgerc(cublasHandle_t handle, const Layout layout, const size_t m, const size_t n, const float2 alpha, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2* y_buffer, const size_t y_offset, const size_t y_inc, float2* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasCgerc(handle, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXgerc(cublasHandle_t handle, const Layout layout, const size_t m, const size_t n, const double2 alpha, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2* y_buffer, const size_t y_offset, const size_t y_inc, double2* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasZgerc(handle, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for CHER/ZHER
cublasStatus_t cublasXher(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float alpha, const float2* x_buffer, const size_t x_offset, const size_t x_inc, float2* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasCher(handle, triangle, static_cast<int>(n), &alpha, reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXher(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double alpha, const double2* x_buffer, const size_t x_offset, const size_t x_inc, double2* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasZher(handle, triangle, static_cast<int>(n), &alpha, reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for CHPR/ZHPR
cublasStatus_t cublasXhpr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float alpha, const float2* x_buffer, const size_t x_offset, const size_t x_inc, float2* ap_buffer, const size_t ap_offset) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasChpr(handle, triangle, static_cast<int>(n), &alpha, reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<cuComplex*>(&ap_buffer[ap_offset]));
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXhpr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double alpha, const double2* x_buffer, const size_t x_offset, const size_t x_inc, double2* ap_buffer, const size_t ap_offset) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasZhpr(handle, triangle, static_cast<int>(n), &alpha, reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<cuDoubleComplex*>(&ap_buffer[ap_offset]));
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for CHER2/ZHER2
cublasStatus_t cublasXher2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float2 alpha, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2* y_buffer, const size_t y_offset, const size_t y_inc, float2* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasCher2(handle, triangle, static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXher2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double2 alpha, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2* y_buffer, const size_t y_offset, const size_t y_inc, double2* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasZher2(handle, triangle, static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for CHPR2/ZHPR2
cublasStatus_t cublasXhpr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float2 alpha, const float2* x_buffer, const size_t x_offset, const size_t x_inc, const float2* y_buffer, const size_t y_offset, const size_t y_inc, float2* ap_buffer, const size_t ap_offset) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasChpr2(handle, triangle, static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<cuComplex*>(&ap_buffer[ap_offset]));
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXhpr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double2 alpha, const double2* x_buffer, const size_t x_offset, const size_t x_inc, const double2* y_buffer, const size_t y_offset, const size_t y_inc, double2* ap_buffer, const size_t ap_offset) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasZhpr2(handle, triangle, static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<cuDoubleComplex*>(&ap_buffer[ap_offset]));
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for SSYR/DSYR
cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float alpha, const float* x_buffer, const size_t x_offset, const size_t x_inc, float* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSsyr(handle, triangle, static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &a_buffer[a_offset], a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double alpha, const double* x_buffer, const size_t x_offset, const size_t x_inc, double* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDsyr(handle, triangle, static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &a_buffer[a_offset], a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const half alpha, const half* x_buffer, const size_t x_offset, const size_t x_inc, half* a_buffer, const size_t a_offset, const size_t a_ld) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for SSPR/DSPR
cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float alpha, const float* x_buffer, const size_t x_offset, const size_t x_inc, float* ap_buffer, const size_t ap_offset) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSspr(handle, triangle, static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &ap_buffer[ap_offset]);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double alpha, const double* x_buffer, const size_t x_offset, const size_t x_inc, double* ap_buffer, const size_t ap_offset) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDspr(handle, triangle, static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &ap_buffer[ap_offset]);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const half alpha, const half* x_buffer, const size_t x_offset, const size_t x_inc, half* ap_buffer, const size_t ap_offset) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for SSYR2/DSYR2
cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float alpha, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float* y_buffer, const size_t y_offset, const size_t y_inc, float* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSsyr2(handle, triangle, static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), &a_buffer[a_offset], a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double alpha, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double* y_buffer, const size_t y_offset, const size_t y_inc, double* a_buffer, const size_t a_offset, const size_t a_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDsyr2(handle, triangle, static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), &a_buffer[a_offset], a_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const half alpha, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half* y_buffer, const size_t y_offset, const size_t y_inc, half* a_buffer, const size_t a_offset, const size_t a_ld) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for SSPR2/DSPR2
cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const float alpha, const float* x_buffer, const size_t x_offset, const size_t x_inc, const float* y_buffer, const size_t y_offset, const size_t y_inc, float* ap_buffer, const size_t ap_offset) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSspr2(handle, triangle, static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), &ap_buffer[ap_offset]);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const double alpha, const double* x_buffer, const size_t x_offset, const size_t x_inc, const double* y_buffer, const size_t y_offset, const size_t y_inc, double* ap_buffer, const size_t ap_offset) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDspr2(handle, triangle, static_cast<int>(n), &alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), &ap_buffer[ap_offset]);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const size_t n, const half alpha, const half* x_buffer, const size_t x_offset, const size_t x_inc, const half* y_buffer, const size_t y_offset, const size_t y_inc, half* ap_buffer, const size_t ap_offset) { return CUBLAS_STATUS_NOT_SUPPORTED; }
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================

// Forwards the cuBLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM
cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, const float* b_buffer, const size_t b_offset, const size_t b_ld, const float beta, float* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSgemm(handle, a_transpose, b_transpose, static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, &beta, &c_buffer[c_offset], c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, const double* b_buffer, const size_t b_offset, const size_t b_ld, const double beta, double* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDgemm(handle, a_transpose, b_transpose, static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, &beta, &c_buffer[c_offset], c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, const size_t m, const size_t n, const size_t k, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, float2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasCgemm(handle, a_transpose, b_transpose, static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), &alpha_cuda, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, &beta_cuda, reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, const size_t m, const size_t n, const size_t k, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, double2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasZgemm(handle, a_transpose, b_transpose, static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, &beta_cuda, reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, const size_t m, const size_t n, const size_t k, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, const half* b_buffer, const size_t b_offset, const size_t b_ld, const half beta, half* c_buffer, const size_t c_offset, const size_t c_ld) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM
cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const size_t m, const size_t n, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, const float* b_buffer, const size_t b_offset, const size_t b_ld, const float beta, float* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSsymm(handle, side, triangle, static_cast<int>(m), static_cast<int>(n), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, &beta, &c_buffer[c_offset], c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const size_t m, const size_t n, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, const double* b_buffer, const size_t b_offset, const size_t b_ld, const double beta, double* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDsymm(handle, side, triangle, static_cast<int>(m), static_cast<int>(n), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, &beta, &c_buffer[c_offset], c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const size_t m, const size_t n, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, float2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasCsymm(handle, side, triangle, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, &beta_cuda, reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const size_t m, const size_t n, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, double2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasZsymm(handle, side, triangle, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, &beta_cuda, reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const size_t m, const size_t n, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, const half* b_buffer, const size_t b_offset, const size_t b_ld, const half beta, half* c_buffer, const size_t c_offset, const size_t c_ld) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for CHEMM/ZHEMM
cublasStatus_t cublasXhemm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const size_t m, const size_t n, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, float2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasChemm(handle, side, triangle, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, &beta_cuda, reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXhemm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const size_t m, const size_t n, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, double2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasZhemm(handle, side, triangle, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, &beta_cuda, reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK
cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const size_t n, const size_t k, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, const float beta, float* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSsyrk(handle, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), &alpha, &a_buffer[a_offset], a_ld, &beta, &c_buffer[c_offset], c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const size_t n, const size_t k, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, const double beta, double* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDsyrk(handle, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), &alpha, &a_buffer[a_offset], a_ld, &beta, &c_buffer[c_offset], c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const size_t n, const size_t k, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2 beta, float2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasCsyrk(handle, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), &alpha_cuda, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, &beta_cuda, reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const size_t n, const size_t k, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2 beta, double2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasZsyrk(handle, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, &beta_cuda, reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const size_t n, const size_t k, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, const half beta, half* c_buffer, const size_t c_offset, const size_t c_ld) { return CUBLAS_STATUS_NOT_SUPPORTED; }
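// Editorial note, not part of the original file: every complex overload above repeats the
// same two-statement bridge from CLBlast's float2/double2 (std::complex-based) types to the
// cuComplex/cuDoubleComplex structs that cuBLAS expects. Factored out, the recurring pattern
// looks like this hypothetical helper (shown for illustration only):
inline cuComplex ToCuComplex(const float2 value) {
  cuComplex result; result.x = value.real(); result.y = value.imag();
  return result;
}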
// Forwards the cuBLAS calls for CHERK/ZHERK
cublasStatus_t cublasXherk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const size_t n, const size_t k, const float alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float beta, float2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasCherk(handle, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), &alpha, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, &beta, reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXherk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const size_t n, const size_t k, const double alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double beta, double2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasZherk(handle, triangle, a_transpose, static_cast<int>(n), static_cast<int>(k), &alpha, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, &beta, reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}

// Forwards the cuBLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K
cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, const size_t n, const size_t k, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, const float* b_buffer, const size_t b_offset, const size_t b_ld, const float beta, float* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasSsyr2k(handle, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, &beta, &c_buffer[c_offset], c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, const size_t n, const size_t k, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, const double* b_buffer, const size_t b_offset, const size_t b_ld, const double beta, double* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDsyr2k(handle, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, &beta, &c_buffer[c_offset], c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, const size_t n, const size_t k, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, float2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasCsyr2k(handle, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k), &alpha_cuda, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, &beta_cuda, reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, const size_t n, const size_t k, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, double2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  cuDoubleComplex beta_cuda; beta_cuda.x = beta.real(); beta_cuda.y = beta.imag();
  auto status = cublasZsyr2k(handle, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, &beta_cuda, reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, const size_t n, const size_t k, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, const half* b_buffer, const size_t b_offset, const size_t b_ld, const half beta, half* c_buffer, const size_t c_offset, const size_t c_ld) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for CHER2K/ZHER2K
cublasStatus_t cublasXher2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, const size_t n, const size_t k, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, const float2* b_buffer, const size_t b_offset, const size_t b_ld, const float beta, float2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasCher2k(handle, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k), &alpha_cuda, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, &beta, reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXher2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, const size_t n, const size_t k, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, const double2* b_buffer, const size_t b_offset, const size_t b_ld, const double beta, double2* c_buffer, const size_t c_offset, const size_t c_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasZher2k(handle, triangle, ab_transpose, static_cast<int>(n), static_cast<int>(k), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, &beta, reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld);
  cudaDeviceSynchronize();
  return status;
}
// Forwards the cuBLAS calls for STRMM/DTRMM/CTRMM/ZTRMM
// Note: cuBLAS' trmm writes to a separate output matrix, so B is passed as both the input
// and the output operand to obtain the in-place BLAS-style behaviour.
cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, float* b_buffer, const size_t b_offset, const size_t b_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasStrmm(handle, side, triangle, a_transpose, diagonal, static_cast<int>(m), static_cast<int>(n), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, &b_buffer[b_offset], b_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, double* b_buffer, const size_t b_offset, const size_t b_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDtrmm(handle, side, triangle, a_transpose, diagonal, static_cast<int>(m), static_cast<int>(n), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, &b_buffer[b_offset], b_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, float2* b_buffer, const size_t b_offset, const size_t b_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasCtrmm(handle, side, triangle, a_transpose, diagonal, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, reinterpret_cast<cuComplex*>(&b_buffer[b_offset]), b_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, double2* b_buffer, const size_t b_offset, const size_t b_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasZtrmm(handle, side, triangle, a_transpose, diagonal, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, reinterpret_cast<cuDoubleComplex*>(&b_buffer[b_offset]), b_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const half alpha, const half* a_buffer, const size_t a_offset, const size_t a_ld, half* b_buffer, const size_t b_offset, const size_t b_ld) { return CUBLAS_STATUS_NOT_SUPPORTED; }

// Forwards the cuBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM
cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const float alpha, const float* a_buffer, const size_t a_offset, const size_t a_ld, float* b_buffer, const size_t b_offset, const size_t b_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasStrsm(handle, side, triangle, a_transpose, diagonal, static_cast<int>(m), static_cast<int>(n), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const double alpha, const double* a_buffer, const size_t a_offset, const size_t a_ld, double* b_buffer, const size_t b_offset, const size_t b_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  auto status = cublasDtrsm(handle, side, triangle, a_transpose, diagonal, static_cast<int>(m), static_cast<int>(n), &alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const float2 alpha, const float2* a_buffer, const size_t a_offset, const size_t a_ld, float2* b_buffer, const size_t b_offset, const size_t b_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasCtrsm(handle, side, triangle, a_transpose, diagonal, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<cuComplex*>(&b_buffer[b_offset]), b_ld);
  cudaDeviceSynchronize();
  return status;
}
cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, const size_t m, const size_t n, const double2 alpha, const double2* a_buffer, const size_t a_offset, const size_t a_ld, double2* b_buffer, const size_t b_offset, const size_t b_ld) {
  if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
  cuDoubleComplex alpha_cuda; alpha_cuda.x = alpha.real(); alpha_cuda.y = alpha.imag();
  auto status = cublasZtrsm(handle, side, triangle, a_transpose, diagonal, static_cast<int>(m), static_cast<int>(n), &alpha_cuda, reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<cuDoubleComplex*>(&b_buffer[b_offset]), b_ld);
  cudaDeviceSynchronize();
  return status;
}

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_WRAPPER_CUBLAS_H_
#endif
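// Editorial usage sketch, not part of the CLBlast sources: the overloads above give every
// precision the same argument list, so a templated caller can drive cuBLAS without per-type
// switches. The function name 'ExampleGemm' and the column-major leading dimensions below
// are assumptions for illustration only; the handle and device buffers must already be set up.
template <typename T>
cublasStatus_t ExampleGemm(cublasHandle_t handle, const T* a, const T* b, T* c,
                           const size_t m, const size_t n, const size_t k) {
  return clblast::cublasXgemm(handle, clblast::Layout::kColMajor, CUBLAS_OP_N, CUBLAS_OP_N,
                              m, n, k, T{1},
                              a, 0, m,   // A: zero offset, leading dimension m
                              b, 0, k,   // B: zero offset, leading dimension k
                              T{0},
                              c, 0, m);  // C: zero offset, leading dimension m
}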
CLBlast-1.6.3/test/wrapper_cuda.hpp000066400000000000000000000144761463263031500171660ustar00rootroot00000000000000
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the CUDA related code; used only in case of testing against cuBLAS
//
// =================================================================================================

#ifndef CLBLAST_TEST_WRAPPER_CUDA_H_
#define CLBLAST_TEST_WRAPPER_CUDA_H_

#include <stdexcept>
#include <string>
#include <vector>

#include "utilities/utilities.hpp"

#ifdef CLBLAST_REF_CUBLAS
  #define CUDA_NO_HALF
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
#endif

namespace clblast {
// =================================================================================================

#ifdef CLBLAST_REF_CUBLAS
template <typename T>
void cublasSetup(Arguments<T> &args) {
  cudaSetDevice(static_cast<int>(args.device_id));
  auto status = cublasCreate(reinterpret_cast<cublasHandle_t*>(&args.cublas_handle));
  if (status != CUBLAS_STATUS_SUCCESS) { throw std::runtime_error("CUDA cublasCreate error"); }
}
#endif

#ifdef CLBLAST_REF_CUBLAS
template <typename T>
void cublasTeardown(Arguments<T> &args) {
  auto status = cublasDestroy(reinterpret_cast<cublasHandle_t>(args.cublas_handle));
  if (status != CUBLAS_STATUS_SUCCESS) { throw std::runtime_error("CUDA cublasDestroy error"); }
}
#endif

// =================================================================================================

// Copies data from the CUDA device to the host and frees-up the CUDA memory afterwards
#ifdef CLBLAST_REF_CUBLAS
template <typename T>
void CUDAToHost(T** buffer_cuda, std::vector<T> &buffer_host, const size_t size) {
  auto status1 = cudaMemcpy(
    reinterpret_cast<void*>(buffer_host.data()),
    reinterpret_cast<void*>(*buffer_cuda),
    size*sizeof(T),
    cudaMemcpyDeviceToHost
  );
  if (status1 != cudaSuccess) { throw std::runtime_error("CUDA cudaMemcpy error with status: "+ToString(static_cast<int>(status1))); }
  auto status2 = cudaFree(*buffer_cuda);
  if (status2 != cudaSuccess) { throw std::runtime_error("CUDA cudaFree error with status: "+ToString(static_cast<int>(status2))); }
  *buffer_cuda = nullptr;
}
#else
template <typename T> void CUDAToHost(T**, const std::vector<T>&, const size_t) { }
#endif

// Allocates space on the CUDA device and copies in data from the host
#ifdef CLBLAST_REF_CUBLAS
template <typename T>
void HostToCUDA(T** buffer_cuda, std::vector<T> &buffer_host, const size_t size) {
  if (*buffer_cuda == nullptr) {
    auto status1 = cudaMalloc(reinterpret_cast<void**>(buffer_cuda), size*sizeof(T));
    if (status1 != cudaSuccess) { throw std::runtime_error("CUDA cudaMalloc error with status: "+ToString(static_cast<int>(status1))); }
  }
  auto status2 = cudaMemcpy(
    reinterpret_cast<void*>(*buffer_cuda),
    reinterpret_cast<void*>(buffer_host.data()),
    size*sizeof(T),
    cudaMemcpyHostToDevice
  );
  if (status2 != cudaSuccess) { throw std::runtime_error("CUDA cudaMemcpy error with status: "+ToString(static_cast<int>(status2))); }
}
#else
template <typename T> void HostToCUDA(T**, const std::vector<T>&, const size_t) { }
#endif

// =================================================================================================

template <typename T>
struct BuffersCUDA {
  T* x_vec = nullptr;
  T* y_vec = nullptr;
  T* a_mat = nullptr;
  T* b_mat = nullptr;
  T* c_mat = nullptr;
  T* ap_mat = nullptr;
  T* scalar = nullptr;
};
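// Editorial usage sketch, not part of the original file: a host vector can be round-tripped
// through CUDA device memory with the two helpers above (requires CLBLAST_REF_CUBLAS).
// The function name 'ExampleRoundTrip' is an assumption for illustration only.
template <typename T>
void ExampleRoundTrip(const size_t size) {
  auto host = std::vector<T>(size, static_cast<T>(1));
  T* device_ptr = nullptr;
  HostToCUDA(&device_ptr, host, size);  // allocates on first use, then copies host -> device
  CUDAToHost(&device_ptr, host, size);  // copies device -> host, then frees the CUDA buffer
}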
template <typename T, typename U>
void CUDAToHost(const Arguments<U> &args, BuffersCUDA<T> &buffers, BuffersHost<T> &buffers_host, const std::vector<std::string> &names) {
  for (auto &name: names) {
    if (name == kBufVecX) { buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(0)); CUDAToHost(&buffers.x_vec, buffers_host.x_vec, args.x_size); }
    else if (name == kBufVecY) { buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0)); CUDAToHost(&buffers.y_vec, buffers_host.y_vec, args.y_size); }
    else if (name == kBufMatA) { buffers_host.a_mat = std::vector<T>(args.a_size, static_cast<T>(0)); CUDAToHost(&buffers.a_mat, buffers_host.a_mat, args.a_size); }
    else if (name == kBufMatB) { buffers_host.b_mat = std::vector<T>(args.b_size, static_cast<T>(0)); CUDAToHost(&buffers.b_mat, buffers_host.b_mat, args.b_size); }
    else if (name == kBufMatC) { buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0)); CUDAToHost(&buffers.c_mat, buffers_host.c_mat, args.c_size); }
    else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); CUDAToHost(&buffers.ap_mat, buffers_host.ap_mat, args.ap_size); }
    else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); CUDAToHost(&buffers.scalar, buffers_host.scalar, args.scalar_size); }
    else { throw std::runtime_error("Invalid buffer name"); }
  }
}

template <typename T, typename U>
void HostToCUDA(const Arguments<U> &args, BuffersCUDA<T> &buffers, BuffersHost<T> &buffers_host, const std::vector<std::string> &names) {
  for (auto &name: names) {
    if (name == kBufVecX) { HostToCUDA(&buffers.x_vec, buffers_host.x_vec, args.x_size); }
    else if (name == kBufVecY) { HostToCUDA(&buffers.y_vec, buffers_host.y_vec, args.y_size); }
    else if (name == kBufMatA) { HostToCUDA(&buffers.a_mat, buffers_host.a_mat, args.a_size); }
    else if (name == kBufMatB) { HostToCUDA(&buffers.b_mat, buffers_host.b_mat, args.b_size); }
    else if (name == kBufMatC) { HostToCUDA(&buffers.c_mat, buffers_host.c_mat, args.c_size); }
    else if (name == kBufMatAP) { HostToCUDA(&buffers.ap_mat, buffers_host.ap_mat, args.ap_size); }
    else if (name == kBufScalar) { HostToCUDA(&buffers.scalar, buffers_host.scalar, args.scalar_size); }
    else { throw std::runtime_error("Invalid buffer name"); }
  }
}

// =================================================================================================
} // namespace clblast

// CLBLAST_TEST_WRAPPER_CUDA_H_
#endif
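// Editorial usage sketch, not part of the CLBlast sources: a test can move a named set of
// buffers to the device, run one of the cublasX* wrappers, and fetch the results back via
// the dispatch helpers above. 'ExampleReferenceRun' is an assumption for illustration; the
// kBufVecX/kBufVecY constants and BuffersHost are taken from utilities/utilities.hpp.
template <typename T>
void ExampleReferenceRun(const clblast::Arguments<T> &args) {
  auto buffers = clblast::BuffersCUDA<T>();
  auto buffers_host = clblast::BuffersHost<T>();
  buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(1));
  buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0));
  clblast::HostToCUDA(args, buffers, buffers_host, {clblast::kBufVecX, clblast::kBufVecY});
  // ... call one of the wrapped cuBLAS routines on buffers.x_vec / buffers.y_vec here ...
  clblast::CUDAToHost(args, buffers, buffers_host, {clblast::kBufVecY});
}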